Merge pull request #423 from ICLDisco/tuned
Dismantle the Tuned collective
Этот коммит содержится в:
Коммит
cf56c6a9f2
@ -2,7 +2,7 @@
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -20,10 +20,26 @@ dist_ompidata_DATA = base/help-mca-coll-base.txt
|
||||
|
||||
headers += \
|
||||
base/base.h \
|
||||
base/coll_tags.h
|
||||
base/coll_tags.h \
|
||||
base/coll_base_topo.h \
|
||||
base/coll_base_util.h \
|
||||
base/coll_base_functions.h
|
||||
|
||||
libmca_coll_la_SOURCES += \
|
||||
base/coll_base_comm_select.c \
|
||||
base/coll_base_comm_unselect.c \
|
||||
base/coll_base_find_available.c \
|
||||
base/coll_base_frame.c
|
||||
base/coll_base_frame.c \
|
||||
base/coll_base_bcast.c \
|
||||
base/coll_base_scatter.c \
|
||||
base/coll_base_topo.c \
|
||||
base/coll_base_allgather.c \
|
||||
base/coll_base_allgatherv.c \
|
||||
base/coll_base_util.c \
|
||||
base/coll_base_allreduce.c \
|
||||
base/coll_base_alltoall.c \
|
||||
base/coll_base_gather.c \
|
||||
base/coll_base_alltoallv.c \
|
||||
base/coll_base_reduce.c \
|
||||
base/coll_base_barrier.c \
|
||||
base/coll_base_reduce_scatter.c
|
||||
|
@ -87,7 +87,7 @@ int mca_coll_base_find_available(bool enable_progress_threads,
|
||||
* coll component needs to be selected for it. It should be invoked
|
||||
* near the end of the communicator creation process such that
|
||||
* almost everything else is functional on the communicator (e.g.,
|
||||
* point-to-point communication).
|
||||
* point-to-point communication).
|
||||
*
|
||||
* Note that new communicators may be created as a result of
|
||||
* invoking this function. Specifically: this function is called in
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -30,31 +30,12 @@
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
/* allgather algorithm variables */
|
||||
static int coll_tuned_allgather_algorithm_count = 6;
|
||||
static int coll_tuned_allgather_forced_algorithm = 0;
|
||||
static int coll_tuned_allgather_segment_size = 0;
|
||||
static int coll_tuned_allgather_tree_fanout;
|
||||
static int coll_tuned_allgather_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_allgather_forced_algorithm */
|
||||
static mca_base_var_enum_value_t allgather_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "linear"},
|
||||
{2, "bruck"},
|
||||
{3, "recursive_doubling"},
|
||||
{4, "ring"},
|
||||
{5, "neighbor"},
|
||||
{6, "two_proc"},
|
||||
{0, NULL}
|
||||
};
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allgather_intra_bruck
|
||||
* ompi_coll_base_allgather_intra_bruck
|
||||
*
|
||||
* Function: allgather using O(log(N)) steps.
|
||||
* Accepts: Same arguments as MPI_Allgather
|
||||
@ -65,7 +46,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
|
||||
* in Multiport Message-Passing Systems"
|
||||
* Memory requirements: non-zero ranks require shift buffer to perform final
|
||||
* step in the algorithm.
|
||||
*
|
||||
*
|
||||
* Example on 6 nodes:
|
||||
* Initialization: everyone has its own buffer at location 0 in rbuf
|
||||
* This means if user specified MPI_IN_PLACE for sendbuf
|
||||
@ -84,7 +65,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
|
||||
* [2] [3] [4] [5] [0] [1]
|
||||
* [3] [4] [5] [0] [1] [2]
|
||||
* Step 2: send message to (rank - 2^2), receive message from (rank + 2^2)
|
||||
* message size is "all remaining blocks"
|
||||
* message size is "all remaining blocks"
|
||||
* # 0 1 2 3 4 5
|
||||
* [0] [1] [2] [3] [4] [5]
|
||||
* [1] [2] [3] [4] [5] [0]
|
||||
@ -101,7 +82,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
|
||||
* [4] [4] [4] [4] [4] [4]
|
||||
* [5] [5] [5] [5] [5] [5]
|
||||
*/
|
||||
int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
int ompi_coll_base_allgather_intra_bruck(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -115,8 +96,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_bruck rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgather_intra_bruck rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -125,7 +106,7 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Initialization step:
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
|
||||
receive buffer, else
|
||||
- if rank r != 0, copy r^th block from receive buffer to block 0.
|
||||
*/
|
||||
@ -140,15 +121,15 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend);
|
||||
if (err < 0) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
|
||||
/* Communication step:
|
||||
At every step i, rank r:
|
||||
- doubles the distance
|
||||
- sends message which starts at begining of rbuf and has size
|
||||
- sends message which starts at begining of rbuf and has size
|
||||
(blockcount * rcount) to rank (r - distance)
|
||||
- receives message of size blockcount * rcount from rank (r + distance)
|
||||
at location (rbuf + distance * rcount * rext)
|
||||
- blockcount doubles until last step when only the remaining data is
|
||||
- blockcount doubles until last step when only the remaining data is
|
||||
exchanged.
|
||||
*/
|
||||
blockcount = 1;
|
||||
@ -162,14 +143,14 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
|
||||
if (distance <= (size >> 1)) {
|
||||
blockcount = distance;
|
||||
} else {
|
||||
} else {
|
||||
blockcount = size - distance;
|
||||
}
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, blockcount * rcount, rdtype,
|
||||
err = ompi_coll_base_sendrecv(tmpsend, blockcount * rcount, rdtype,
|
||||
sendto, MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
tmprecv, blockcount * rcount, rdtype,
|
||||
tmprecv, blockcount * rcount, rdtype,
|
||||
recvfrom, MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -178,8 +159,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
|
||||
/* Finalization step:
|
||||
On all nodes except 0, data needs to be shifted locally:
|
||||
- create temporary shift buffer,
|
||||
see discussion in coll_basic_reduce.c about the size and begining
|
||||
- create temporary shift buffer,
|
||||
see discussion in coll_basic_reduce.c about the size and begining
|
||||
of temporary buffer.
|
||||
- copy blocks [0 .. (size - rank - 1)] from rbuf to shift buffer
|
||||
- move blocks [(size - rank) .. size] from rbuf to begining of rbuf
|
||||
@ -195,8 +176,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
free_buf = (char*) calloc(((true_extent +
|
||||
((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)),
|
||||
sizeof(char));
|
||||
if (NULL == free_buf) {
|
||||
line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
|
||||
if (NULL == free_buf) {
|
||||
line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
|
||||
}
|
||||
shift_buf = free_buf - true_lb;
|
||||
|
||||
@ -207,13 +188,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
|
||||
/* 2. move blocks [(size - rank) .. size] from rbuf to the begining of rbuf */
|
||||
tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext;
|
||||
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
|
||||
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
|
||||
rbuf, tmpsend);
|
||||
if (err < 0) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* 3. copy blocks from shift buffer back to rbuf starting at block [rank]. */
|
||||
tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
|
||||
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
|
||||
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
|
||||
tmprecv, shift_buf);
|
||||
if (err < 0) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
@ -223,13 +204,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allgather_intra_recursivedoubling
|
||||
* ompi_coll_base_allgather_intra_recursivedoubling
|
||||
*
|
||||
* Function: allgather using O(log(N)) steps.
|
||||
* Accepts: Same arguments as MPI_Allgather
|
||||
@ -239,29 +220,29 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
* This algorithm is used in MPICH-2 for small- and medium-sized
|
||||
* messages on power-of-two processes.
|
||||
*
|
||||
* Limitation: Current implementation only works on power-of-two number of
|
||||
* processes.
|
||||
* Limitation: Current implementation only works on power-of-two number of
|
||||
* processes.
|
||||
* In case this algorithm is invoked on non-power-of-two
|
||||
* processes, Bruck algorithm will be invoked.
|
||||
*
|
||||
*
|
||||
* Memory requirements:
|
||||
* No additional memory requirements beyond user-supplied buffers.
|
||||
*
|
||||
*
|
||||
* Example on 4 nodes:
|
||||
* Initialization: everyone has its own buffer at location rank in rbuf
|
||||
* # 0 1 2 3
|
||||
* # 0 1 2 3
|
||||
* [0] [ ] [ ] [ ]
|
||||
* [ ] [1] [ ] [ ]
|
||||
* [ ] [ ] [2] [ ]
|
||||
* [ ] [ ] [ ] [3]
|
||||
* Step 0: exchange data with (rank ^ 2^0)
|
||||
* # 0 1 2 3
|
||||
* # 0 1 2 3
|
||||
* [0] [0] [ ] [ ]
|
||||
* [1] [1] [ ] [ ]
|
||||
* [ ] [ ] [2] [2]
|
||||
* [ ] [ ] [3] [3]
|
||||
* Step 1: exchange data with (rank ^ 2^1) (if you can)
|
||||
* # 0 1 2 3
|
||||
* # 0 1 2 3
|
||||
* [0] [0] [0] [0]
|
||||
* [1] [1] [1] [1]
|
||||
* [2] [2] [2] [2]
|
||||
@ -269,12 +250,12 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
|
||||
*
|
||||
* TODO: Modify the algorithm to work with any number of nodes.
|
||||
* We can modify code to use identical implementation like MPICH-2:
|
||||
* - using recursive-halving algorithm, at the end of each step,
|
||||
* - using recursive-halving algorithm, at the end of each step,
|
||||
* determine if there are nodes who did not exchange their data in that
|
||||
* step, and send them appropriate messages.
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
int
|
||||
ompi_coll_base_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -293,21 +274,21 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
pow2size >>=1;
|
||||
|
||||
/* Current implementation only handles power-of-two number of processes.
|
||||
If the function was called on non-power-of-two number of processes,
|
||||
If the function was called on non-power-of-two number of processes,
|
||||
print warning and call bruck allgather algorithm with same parameters.
|
||||
*/
|
||||
if (pow2size != size) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
|
||||
size));
|
||||
|
||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_recursivedoubling rank %d, size %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgather_intra_recursivedoubling rank %d, size %d",
|
||||
rank, size));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
@ -317,7 +298,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Initialization step:
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
|
||||
receive buffer
|
||||
*/
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
@ -326,8 +307,8 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* Communication step:
|
||||
At every step i, rank r:
|
||||
- exchanges message with rank remote = (r ^ 2^i).
|
||||
@ -347,7 +328,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
}
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
|
||||
err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
|
||||
remote, MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
|
||||
remote, MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
@ -359,7 +340,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
@ -367,7 +348,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allgather_intra_ring
|
||||
* ompi_coll_base_allgather_intra_ring
|
||||
*
|
||||
* Function: allgather using O(N) steps.
|
||||
* Accepts: Same arguments as MPI_Allgather
|
||||
@ -379,9 +360,9 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
|
||||
* (r + 1) containing data from rank (r - i), with wrap arounds.
|
||||
* Memory requirements:
|
||||
* No additional memory requirements.
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
|
||||
int ompi_coll_base_allgather_intra_ring(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -395,8 +376,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_ring rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgather_intra_ring rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -413,15 +394,15 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
|
||||
tmpsend = (char*) sbuf;
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* Communication step:
|
||||
At every step i: 0 .. (P-1), rank r:
|
||||
- receives message from [(r - 1 + size) % size] containing data from rank
|
||||
[(r - i - 1 + size) % size]
|
||||
- sends message to rank [(r + 1) % size] containing data from rank
|
||||
[(r - i + size) % size]
|
||||
- sends message which starts at begining of rbuf and has size
|
||||
- sends message which starts at begining of rbuf and has size
|
||||
*/
|
||||
sendto = (rank + 1) % size;
|
||||
recvfrom = (rank - 1 + size) % size;
|
||||
@ -434,7 +415,7 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
|
||||
tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext;
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, sendto,
|
||||
err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, sendto,
|
||||
MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
tmprecv, rcount, rdtype, recvfrom,
|
||||
MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
@ -446,34 +427,34 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allgather_intra_neighborexchange
|
||||
* ompi_coll_base_allgather_intra_neighborexchange
|
||||
*
|
||||
* Function: allgather using N/2 steps (O(N))
|
||||
* Accepts: Same arguments as MPI_Allgather
|
||||
* Returns: MPI_SUCCESS or error code
|
||||
*
|
||||
* Description: Neighbor Exchange algorithm for allgather.
|
||||
* Described by Chen et.al. in
|
||||
* "Performance Evaluation of Allgather Algorithms on
|
||||
* Described by Chen et.al. in
|
||||
* "Performance Evaluation of Allgather Algorithms on
|
||||
* Terascale Linux Cluster with Fast Ethernet",
|
||||
* Proceedings of the Eighth International Conference on
|
||||
* Proceedings of the Eighth International Conference on
|
||||
* High-Performance Computing inn Asia-Pacific Region
|
||||
* (HPCASIA'05), 2005
|
||||
*
|
||||
*
|
||||
* Rank r exchanges message with one of its neighbors and
|
||||
* forwards the data further in the next step.
|
||||
*
|
||||
* No additional memory requirements.
|
||||
*
|
||||
*
|
||||
* Limitations: Algorithm works only on even number of processes.
|
||||
* For odd number of processes we switch to ring algorithm.
|
||||
*
|
||||
*
|
||||
* Example on 6 nodes:
|
||||
* Initial state
|
||||
* # 0 1 2 3 4 5
|
||||
@ -508,8 +489,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
|
||||
* [4] [4] [4] [4] [4] [4]
|
||||
* [5] [5] [5] [5] [5] [5]
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
|
||||
int
|
||||
ompi_coll_base_allgather_intra_neighborexchange(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -525,16 +506,16 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
if (size % 2) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
|
||||
size));
|
||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_neighborexchange rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgather_intra_neighborexchange rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -551,7 +532,7 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
|
||||
tmpsend = (char*) sbuf;
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
}
|
||||
|
||||
/* Determine neighbors, order in which blocks will arrive, etc. */
|
||||
even_rank = !(rank % 2);
|
||||
@ -573,15 +554,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
|
||||
|
||||
/* Communication loop:
|
||||
- First step is special: exchange a single block with neighbor[0].
|
||||
- Rest of the steps:
|
||||
update recv_data_from according to offset, and
|
||||
- Rest of the steps:
|
||||
update recv_data_from according to offset, and
|
||||
exchange two blocks with appropriate neighbor.
|
||||
the send location becomes previous receve location.
|
||||
*/
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext;
|
||||
tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
|
||||
err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
|
||||
MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
tmprecv, rcount, rdtype, neighbor[0],
|
||||
MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
@ -597,15 +578,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
|
||||
|
||||
for (i = 1; i < (size / 2); i++) {
|
||||
const int i_parity = i % 2;
|
||||
recv_data_from[i_parity] =
|
||||
recv_data_from[i_parity] =
|
||||
(recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
|
||||
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext;
|
||||
tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * rcount * rext;
|
||||
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
|
||||
neighbor[i_parity],
|
||||
err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
|
||||
neighbor[i_parity],
|
||||
MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
|
||||
neighbor[i_parity],
|
||||
@ -619,13 +600,13 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
int ompi_coll_base_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -638,8 +619,8 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgather_intra_two_procs rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_allgather_intra_two_procs rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -661,7 +642,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
}
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext;
|
||||
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote,
|
||||
err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
tmprecv, rcount, rdtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLGATHER,
|
||||
@ -670,7 +651,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
/* Place your data in correct location if necessary */
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
|
||||
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
|
||||
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
@ -678,7 +659,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
@ -687,13 +668,13 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
* have to duplicate code.
|
||||
* JPG following the examples from other coll_tuned implementations. Dec06.
|
||||
* JPG following the examples from other coll_base implementations. Dec06.
|
||||
*/
|
||||
|
||||
/* copied function (with appropriate renaming) starts here */
|
||||
@ -706,10 +687,10 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
ompi_coll_base_allgather_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf,
|
||||
int rcount,
|
||||
int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
@ -727,7 +708,7 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
|
||||
sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount);
|
||||
sdtype = rdtype;
|
||||
scount = rcount;
|
||||
}
|
||||
}
|
||||
|
||||
/* Gather and broadcast. */
|
||||
|
||||
@ -755,183 +736,3 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
|
||||
}
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int
|
||||
ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = coll_tuned_allgather_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_count",
|
||||
"Number of allgather algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_allgather_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_allgather_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_allgather_algorithms", allgather_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm",
|
||||
"Which allallgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_allgather_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_segment_size);
|
||||
|
||||
coll_tuned_allgather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_tree_fanout);
|
||||
|
||||
coll_tuned_allgather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[ALLGATHER].algorithm));
|
||||
|
||||
switch (data->user_forced[ALLGATHER].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_allgather_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_allgather_intra_bruck (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (3):
|
||||
return ompi_coll_tuned_allgather_intra_recursivedoubling (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (4):
|
||||
return ompi_coll_tuned_allgather_intra_ring (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (5):
|
||||
return ompi_coll_tuned_allgather_intra_neighborexchange (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (6):
|
||||
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[ALLGATHER].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_allgather_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (3):
|
||||
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (4):
|
||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (5):
|
||||
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (6):
|
||||
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -30,19 +30,12 @@
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/* allgatherv algorithm variables */
|
||||
static int coll_tuned_allgatherv_algorithm_count = 5;
|
||||
static int coll_tuned_allgatherv_forced_algorithm = 0;
|
||||
static int coll_tuned_allgatherv_segment_size = 0;
|
||||
static int coll_tuned_allgatherv_tree_fanout;
|
||||
static int coll_tuned_allgatherv_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_allgatherv_forced_algorithm */
|
||||
static mca_base_var_enum_value_t allgatherv_algorithms[] = {
|
||||
/* valid values for coll_base_allgatherv_forced_algorithm */
|
||||
mca_base_var_enum_value_t coll_base_allgatherv_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "default"},
|
||||
{2, "bruck"},
|
||||
@ -53,7 +46,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
|
||||
};
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allgatherv_intra_bruck
|
||||
* ompi_coll_base_allgatherv_intra_bruck
|
||||
*
|
||||
* Function: allgather using O(log(N)) steps.
|
||||
* Accepts: Same arguments as MPI_Allgather
|
||||
@ -64,7 +57,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
|
||||
* in Multiport Message-Passing Systems"
|
||||
* Note: Unlike in case of allgather implementation, we relay on
|
||||
* indexed datatype to select buffers appropriately.
|
||||
* The only additional memory requirement is for creation of
|
||||
* The only additional memory requirement is for creation of
|
||||
* temporary datatypes.
|
||||
* Example on 7 nodes (memory lay out need not be in-order)
|
||||
* Initial set up:
|
||||
@ -86,7 +79,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
|
||||
* [ ] [ ] [ ] [ ] [5] [5] [ ]
|
||||
* [ ] [ ] [ ] [ ] [ ] [6] [6]
|
||||
* Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
|
||||
* message contains all blocks from (rank) .. (rank + 2^2) with
|
||||
* message contains all blocks from (rank) .. (rank + 2^2) with
|
||||
* wrap around.
|
||||
* # 0 1 2 3 4 5 6
|
||||
* [0] [ ] [ ] [ ] [0] [0] [0]
|
||||
@ -97,7 +90,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
|
||||
* [ ] [ ] [5] [5] [5] [5] [ ]
|
||||
* [ ] [ ] [ ] [6] [6] [6] [6]
|
||||
* Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
|
||||
* message size is "all remaining blocks"
|
||||
* message size is "all remaining blocks"
|
||||
* # 0 1 2 3 4 5 6
|
||||
* [0] [0] [0] [0] [0] [0] [0]
|
||||
* [1] [1] [1] [1] [1] [1] [1]
|
||||
@ -107,10 +100,10 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
|
||||
* [5] [5] [5] [5] [5] [5] [5]
|
||||
* [6] [6] [6] [6] [6] [6] [6]
|
||||
*/
|
||||
int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
int ompi_coll_base_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
@ -124,9 +117,9 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_bruck rank %d", rank));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgather_intra_bruck rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
@ -134,27 +127,27 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Initialization step:
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
|
||||
the receive buffer.
|
||||
*/
|
||||
tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
tmpsend = (char*) sbuf;
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||
tmprecv, rcounts[rank], rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* Communication step:
|
||||
At every step i, rank r:
|
||||
- doubles the distance
|
||||
- sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
|
||||
to rank (r - distance)
|
||||
- receives message of blockcount blocks,
|
||||
(rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
|
||||
- receives message of blockcount blocks,
|
||||
(rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
|
||||
rank (r + distance)
|
||||
- blockcount doubles until the last step when only the remaining data is
|
||||
- blockcount doubles until the last step when only the remaining data is
|
||||
exchanged.
|
||||
*/
|
||||
blockcount = 1;
|
||||
@ -173,7 +166,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
|
||||
if (distance <= (size >> 1)) {
|
||||
blockcount = distance;
|
||||
} else {
|
||||
} else {
|
||||
blockcount = size - distance;
|
||||
}
|
||||
|
||||
@ -186,7 +179,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
new_rcounts[i] = rcounts[tmp_rrank];
|
||||
new_rdispls[i] = rdispls[tmp_rrank];
|
||||
}
|
||||
err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls,
|
||||
err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls,
|
||||
rdtype, &new_sdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
err = ompi_datatype_create_indexed(blockcount, new_rcounts, new_rdispls,
|
||||
@ -198,7 +191,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto,
|
||||
err = ompi_coll_base_sendrecv(rbuf, 1, new_sdtype, sendto,
|
||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
rbuf, 1, new_rdtype, recvfrom,
|
||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
@ -207,7 +200,6 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
|
||||
ompi_datatype_destroy(&new_sdtype);
|
||||
ompi_datatype_destroy(&new_rdtype);
|
||||
|
||||
}
|
||||
|
||||
free(new_rcounts);
|
||||
@ -217,14 +209,14 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
err_hndl:
|
||||
if( NULL != new_rcounts ) free(new_rcounts);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allgatherv_intra_ring
|
||||
* ompi_coll_base_allgatherv_intra_ring
|
||||
*
|
||||
* Function: allgatherv using O(N) steps.
|
||||
* Accepts: Same arguments as MPI_Allgatherv
|
||||
@ -236,9 +228,9 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
||||
* (r + 1) containing data from rank (r - i), with wrap arounds.
|
||||
* Memory requirements:
|
||||
* No additional memory requirements.
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
|
||||
int ompi_coll_base_allgatherv_intra_ring(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -252,8 +244,8 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_ring rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgatherv_intra_ring rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -262,24 +254,24 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Initialization step:
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to
|
||||
the appropriate block of receive buffer
|
||||
*/
|
||||
tmprecv = (char*) rbuf + (ptrdiff_t)rdisps[rank] * rext;
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
tmpsend = (char*) sbuf;
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||
tmprecv, rcounts[rank], rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* Communication step:
|
||||
At every step i: 0 .. (P-1), rank r:
|
||||
- receives message from [(r - 1 + size) % size] containing data from rank
|
||||
[(r - i - 1 + size) % size]
|
||||
- sends message to rank [(r + 1) % size] containing data from rank
|
||||
[(r - i + size) % size]
|
||||
- sends message which starts at begining of rbuf and has size
|
||||
- sends message which starts at begining of rbuf and has size
|
||||
*/
|
||||
sendto = (rank + 1) % size;
|
||||
recvfrom = (rank - 1 + size) % size;
|
||||
@ -292,47 +284,46 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
|
||||
tmpsend = (char*)rbuf + rdisps[senddatafrom] * rext;
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[senddatafrom], rdtype,
|
||||
err = ompi_coll_base_sendrecv(tmpsend, rcounts[senddatafrom], rdtype,
|
||||
sendto, MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
tmprecv, rcounts[recvdatafrom], rdtype,
|
||||
tmprecv, rcounts[recvdatafrom], rdtype,
|
||||
recvfrom, MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allgatherv_intra_neighborexchange
|
||||
* ompi_coll_base_allgatherv_intra_neighborexchange
|
||||
*
|
||||
* Function: allgatherv using N/2 steps (O(N))
|
||||
* Accepts: Same arguments as MPI_Allgatherv
|
||||
* Returns: MPI_SUCCESS or error code
|
||||
*
|
||||
* Description: Neighbor Exchange algorithm for allgather adapted for
|
||||
* Description: Neighbor Exchange algorithm for allgather adapted for
|
||||
* allgatherv.
|
||||
* Described by Chen et.al. in
|
||||
* "Performance Evaluation of Allgather Algorithms on
|
||||
* Described by Chen et.al. in
|
||||
* "Performance Evaluation of Allgather Algorithms on
|
||||
* Terascale Linux Cluster with Fast Ethernet",
|
||||
* Proceedings of the Eighth International Conference on
|
||||
* Proceedings of the Eighth International Conference on
|
||||
* High-Performance Computing inn Asia-Pacific Region
|
||||
* (HPCASIA'05), 2005
|
||||
*
|
||||
*
|
||||
* Rank r exchanges message with one of its neighbors and
|
||||
* forwards the data further in the next step.
|
||||
*
|
||||
* No additional memory requirements.
|
||||
*
|
||||
*
|
||||
* Limitations: Algorithm works only on even number of processes.
|
||||
* For odd number of processes we switch to ring algorithm.
|
||||
*
|
||||
*
|
||||
* Example on 6 nodes:
|
||||
* Initial state
|
||||
* # 0 1 2 3 4 5
|
||||
@ -367,8 +358,8 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
|
||||
* [4] [4] [4] [4] [4] [4]
|
||||
* [5] [5] [5] [5] [5] [5]
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
int
|
||||
ompi_coll_base_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -386,17 +377,17 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
if (size % 2) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
|
||||
size));
|
||||
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_neighborexchange rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allgatherv_intra_neighborexchange rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -405,16 +396,16 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Initialization step:
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to
|
||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to
|
||||
the appropriate block of receive buffer
|
||||
*/
|
||||
tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
tmpsend = (char*) sbuf;
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||
tmprecv, rcounts[rank], rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
}
|
||||
|
||||
/* Determine neighbors, order in which blocks will arrive, etc. */
|
||||
even_rank = !(rank % 2);
|
||||
@ -436,8 +427,8 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
|
||||
/* Communication loop:
|
||||
- First step is special: exchange a single block with neighbor[0].
|
||||
- Rest of the steps:
|
||||
update recv_data_from according to offset, and
|
||||
- Rest of the steps:
|
||||
update recv_data_from according to offset, and
|
||||
exchange two blocks with appropriate neighbor.
|
||||
the send location becomes previous receve location.
|
||||
Note, we need to create indexed datatype to send and receive these
|
||||
@ -445,13 +436,13 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
*/
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[neighbor[0]] * rext;
|
||||
tmpsend = (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext;
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[rank], rdtype,
|
||||
err = ompi_coll_base_sendrecv(tmpsend, rcounts[rank], rdtype,
|
||||
neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
tmprecv, rcounts[neighbor[0]], rdtype,
|
||||
tmprecv, rcounts[neighbor[0]], rdtype,
|
||||
neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
|
||||
/* Determine initial sending counts and displacements*/
|
||||
if (even_rank) {
|
||||
send_data_from = rank;
|
||||
@ -461,7 +452,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
|
||||
for (i = 1; i < (size / 2); i++) {
|
||||
const int i_parity = i % 2;
|
||||
recv_data_from[i_parity] =
|
||||
recv_data_from[i_parity] =
|
||||
(recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
|
||||
|
||||
/* Create new indexed types for sending and receiving.
|
||||
@ -473,7 +464,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
new_scounts[1] = rcounts[(send_data_from + 1)];
|
||||
new_sdispls[0] = rdispls[send_data_from];
|
||||
new_sdispls[1] = rdispls[(send_data_from + 1)];
|
||||
err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype,
|
||||
err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype,
|
||||
&new_sdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
err = ompi_datatype_commit(&new_sdtype);
|
||||
@ -483,17 +474,17 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
|
||||
new_rdispls[0] = rdispls[recv_data_from[i_parity]];
|
||||
new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
|
||||
err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype,
|
||||
err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype,
|
||||
&new_rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
err = ompi_datatype_commit(&new_rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
|
||||
tmprecv = (char*)rbuf;
|
||||
tmpsend = (char*)rbuf;
|
||||
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
|
||||
err = ompi_coll_base_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
|
||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
tmprecv, 1, new_rdtype, neighbor[i_parity],
|
||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
@ -501,7 +492,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
send_data_from = recv_data_from[i_parity];
|
||||
|
||||
|
||||
ompi_datatype_destroy(&new_sdtype);
|
||||
ompi_datatype_destroy(&new_rdtype);
|
||||
}
|
||||
@ -509,13 +500,13 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
||||
int ompi_coll_base_allgatherv_intra_two_procs(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
@ -529,8 +520,8 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgatherv_intra_two_procs rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_allgatherv_intra_two_procs rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
@ -552,7 +543,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
||||
}
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[remote] * rext;
|
||||
|
||||
err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote,
|
||||
err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
tmprecv, rcounts[remote], rdtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||
@ -561,16 +552,16 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
/* Place your data in correct location if necessary */
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
|
||||
(char*)rbuf + (ptrdiff_t)rdispls[rank] * rext,
|
||||
rcounts[rank], rdtype);
|
||||
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
|
||||
(char*)rbuf + (ptrdiff_t)rdispls[rank] * rext,
|
||||
rcounts[rank], rdtype);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
@ -579,13 +570,13 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
* have to duplicate code.
|
||||
* JPG following the examples from other coll_tuned implementations. Dec06.
|
||||
* JPG following the examples from other coll_base implementations. Dec06.
|
||||
*/
|
||||
|
||||
/* copied function (with appropriate renaming) starts here */
|
||||
@ -593,19 +584,19 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
||||
/*
|
||||
* allgatherv_intra_basic
|
||||
*
|
||||
* Function: - allgatherv using other MPI collectives:
|
||||
* Function: - allgatherv using other MPI collectives:
|
||||
* gatherv + bcast (from basic module).
|
||||
* Accepts: - same as MPI_Allgatherv()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *disps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
ompi_coll_base_allgatherv_intra_basic_default(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *disps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, size, rank, err;
|
||||
MPI_Aint extent, lb;
|
||||
@ -619,8 +610,8 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
||||
* to process with rank 0 (OMPI convention)
|
||||
*/
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgatherv_intra_basic_default rank %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_allgatherv_intra_basic_default rank %d",
|
||||
rank));
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
@ -639,7 +630,6 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
||||
rcounts[rank], send_type,rbuf,
|
||||
rcounts, disps, rdtype, 0,
|
||||
comm, comm->c_coll.coll_gatherv_module);
|
||||
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
@ -648,7 +638,7 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
||||
* broadcast the data out to the other processes
|
||||
*
|
||||
* Need to define a datatype that captures the different vectors
|
||||
* from each process. MPI_TYPE_INDEXED with params
|
||||
* from each process. MPI_TYPE_INDEXED with params
|
||||
* size,rcount,displs,rdtype,newtype
|
||||
* should do the trick.
|
||||
* Use underlying ddt functions to create, and commit the
|
||||
@ -660,7 +650,7 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
err = ompi_datatype_commit(&newtype);
|
||||
if(MPI_SUCCESS != err) {
|
||||
return err;
|
||||
@ -675,178 +665,3 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
||||
}
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int
|
||||
ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV] = coll_tuned_allgatherv_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_count",
|
||||
"Number of allgatherv algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_allgatherv_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_allgatherv_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_allgatherv_algorithms", allgatherv_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm",
|
||||
"Which allallgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_allgatherv_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allgatherv algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_segment_size);
|
||||
|
||||
coll_tuned_allgatherv_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allgatherv algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_tree_fanout);
|
||||
|
||||
coll_tuned_allgatherv_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allgatherv algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[ALLGATHERV].algorithm));
|
||||
|
||||
switch (data->user_forced[ALLGATHERV].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_allgatherv_intra_basic_default (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_allgatherv_intra_bruck (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (3):
|
||||
return ompi_coll_tuned_allgatherv_intra_ring (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (4):
|
||||
return ompi_coll_tuned_allgatherv_intra_neighborexchange (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (5):
|
||||
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[ALLGATHERV].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout,
|
||||
int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_allgatherv_intra_basic_default(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (3):
|
||||
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (4):
|
||||
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (5):
|
||||
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -31,41 +31,23 @@
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
/* allreduce algorithm variables */
|
||||
static int coll_tuned_allreduce_algorithm_count = 5;
|
||||
static int coll_tuned_allreduce_forced_algorithm = 0;
|
||||
static int coll_tuned_allreduce_segment_size = 0;
|
||||
static int coll_tuned_allreduce_tree_fanout;
|
||||
static int coll_tuned_allreduce_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_allreduce_forced_algorithm */
|
||||
static mca_base_var_enum_value_t allreduce_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "nonoverlapping"},
|
||||
{3, "recursive_doubling"},
|
||||
{4, "ring"},
|
||||
{5, "segmented_ring"},
|
||||
{0, NULL}
|
||||
};
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allreduce_intra_nonoverlapping
|
||||
* ompi_coll_base_allreduce_intra_nonoverlapping
|
||||
*
|
||||
* This function just calls a reduce followed by a broadcast
|
||||
* both called functions are tuned but they complete sequentially,
|
||||
* both called functions are base but they complete sequentially,
|
||||
* i.e. no additional overlapping
|
||||
* meaning if the number of segments used is greater than the topo depth
|
||||
* meaning if the number of segments used is greater than the topo depth
|
||||
* then once the first segment of data is fully 'reduced' it is not broadcast
|
||||
* while the reduce continues (cost = cost-reduce + cost-bcast + decision x 3)
|
||||
*
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
|
||||
ompi_coll_base_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
@ -75,16 +57,16 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_nonoverlapping rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_nonoverlapping rank %d", rank));
|
||||
|
||||
/* Reduce to 0 and broadcast. */
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
if (0 == rank) {
|
||||
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype,
|
||||
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype,
|
||||
op, 0, comm, comm->c_coll.coll_reduce_module);
|
||||
} else {
|
||||
err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0,
|
||||
err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0,
|
||||
comm, comm->c_coll.coll_reduce_module);
|
||||
}
|
||||
} else {
|
||||
@ -100,21 +82,21 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
|
||||
}
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allreduce_intra_recursivedoubling
|
||||
* ompi_coll_base_allreduce_intra_recursivedoubling
|
||||
*
|
||||
* Function: Recursive doubling algorithm for allreduce operation
|
||||
* Accepts: Same as MPI_Allreduce()
|
||||
* Returns: MPI_SUCCESS or error code
|
||||
*
|
||||
* Description: Implements recursive doubling algorithm for allreduce.
|
||||
* Original (non-segmented) implementation is used in MPICH-2
|
||||
* Description: Implements recursive doubling algorithm for allreduce.
|
||||
* Original (non-segmented) implementation is used in MPICH-2
|
||||
* for small and intermediate size messages.
|
||||
* The algorithm preserves order of operations so it can
|
||||
* The algorithm preserves order of operations so it can
|
||||
* be used both by commutative and non-commutative operations.
|
||||
*
|
||||
* Example on 7 nodes:
|
||||
* Initial state
|
||||
* # 0 1 2 3 4 5 6
|
||||
* # 0 1 2 3 4 5 6
|
||||
* [0] [1] [2] [3] [4] [5] [6]
|
||||
* Initial adjustment step for non-power of two nodes.
|
||||
* old rank 1 3 5 6
|
||||
@ -129,24 +111,24 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
|
||||
* old rank 1 3 5 6
|
||||
* new rank 0 1 2 3
|
||||
* [0+1+] [0+1+] [0+1+] [0+1+]
|
||||
* [2+3+] [2+3+] [2+3+] [2+3+]
|
||||
* [2+3+] [2+3+] [2+3+] [2+3+]
|
||||
* [4+5+] [4+5+] [4+5+] [4+5+]
|
||||
* [6 ] [6 ] [6 ] [6 ]
|
||||
* Final adjustment step for non-power of two nodes
|
||||
* # 0 1 2 3 4 5 6
|
||||
* # 0 1 2 3 4 5 6
|
||||
* [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+]
|
||||
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
|
||||
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
|
||||
* [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+]
|
||||
* [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ]
|
||||
*
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
int
|
||||
ompi_coll_base_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int ret, line, rank, size, adjsize, remote, distance;
|
||||
int newrank, newremote, extra_ranks;
|
||||
@ -157,9 +139,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allreduce_intra_recursivedoubling rank %d", rank));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allreduce_intra_recursivedoubling rank %d", rank));
|
||||
|
||||
/* Special case for size == 1 */
|
||||
if (1 == size) {
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
@ -194,16 +176,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
adjsize >>= 1;
|
||||
|
||||
/* Handle non-power-of-two case:
|
||||
- Even ranks less than 2 * extra_ranks send their data to (rank + 1), and
|
||||
- Even ranks less than 2 * extra_ranks send their data to (rank + 1), and
|
||||
sets new rank to -1.
|
||||
- Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
|
||||
- Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
|
||||
apply appropriate operation, and set new rank to rank/2
|
||||
- Everyone else sets rank to rank - extra_ranks
|
||||
*/
|
||||
extra_ranks = size - adjsize;
|
||||
if (rank < (2 * extra_ranks)) {
|
||||
if (0 == (rank % 2)) {
|
||||
ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
|
||||
ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
@ -221,7 +203,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
newrank = rank - extra_ranks;
|
||||
}
|
||||
|
||||
/* Communication/Computation loop
|
||||
/* Communication/Computation loop
|
||||
- Exchange message with remote node.
|
||||
- Perform appropriate operation taking in account order of operations:
|
||||
result = value (op) result
|
||||
@ -230,14 +212,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
if (newrank < 0) break;
|
||||
/* Determine remote node */
|
||||
newremote = newrank ^ distance;
|
||||
remote = (newremote < extra_ranks)?
|
||||
remote = (newremote < extra_ranks)?
|
||||
(newremote * 2 + 1):(newremote + extra_ranks);
|
||||
|
||||
/* Exchange the data */
|
||||
ret = MCA_PML_CALL(irecv(tmprecv, count, dtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[0]));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote,
|
||||
ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1]));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
@ -258,14 +240,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
}
|
||||
|
||||
/* Handle non-power-of-two case:
|
||||
- Odd ranks less than 2 * extra_ranks send result from tmpsend to
|
||||
- Odd ranks less than 2 * extra_ranks send result from tmpsend to
|
||||
(rank - 1)
|
||||
- Even ranks less than 2 * extra_ranks receive result from (rank + 1)
|
||||
*/
|
||||
if (rank < (2 * extra_ranks)) {
|
||||
if (0 == (rank % 2)) {
|
||||
ret = MCA_PML_CALL(recv(rbuf, count, dtype, (rank + 1),
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
tmpsend = (char*)rbuf;
|
||||
@ -287,14 +269,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
__FILE__, line, rank, ret));
|
||||
if (NULL != inplacebuf) free(inplacebuf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allreduce_intra_ring
|
||||
* ompi_coll_base_allreduce_intra_ring
|
||||
*
|
||||
* Function: Ring algorithm for allreduce operation
|
||||
* Accepts: Same as MPI_Allreduce()
|
||||
@ -304,9 +286,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
* automatically segmented to segment of size M/N.
|
||||
* Algorithm requires 2*N - 1 steps.
|
||||
*
|
||||
* Limitations: The algorithm DOES NOT preserve order of operations so it
|
||||
* Limitations: The algorithm DOES NOT preserve order of operations so it
|
||||
* can be used only for commutative operations.
|
||||
* In addition, algorithm cannot work if the total count is
|
||||
* In addition, algorithm cannot work if the total count is
|
||||
* less than size.
|
||||
* Example on 5 nodes:
|
||||
* Initial state
|
||||
@ -318,7 +300,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
* [04] [14] [24] [34] [44]
|
||||
*
|
||||
* COMPUTATION PHASE
|
||||
* Step 0: rank r sends block r to rank (r+1) and receives bloc (r-1)
|
||||
* Step 0: rank r sends block r to rank (r+1) and receives bloc (r-1)
|
||||
* from rank (r-1) [with wraparound].
|
||||
* # 0 1 2 3 4
|
||||
* [00] [00+10] [20] [30] [40]
|
||||
@ -327,7 +309,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
* [03] [13] [23] [33] [33+43]
|
||||
* [44+04] [14] [24] [34] [44]
|
||||
*
|
||||
* Step 1: rank r sends block (r-1) to rank (r+1) and receives bloc
|
||||
* Step 1: rank r sends block (r-1) to rank (r+1) and receives bloc
|
||||
* (r-2) from rank (r-1) [with wraparound].
|
||||
* # 0 1 2 3 4
|
||||
* [00] [00+10] [01+10+20] [30] [40]
|
||||
@ -336,7 +318,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
* [33+43+03] [13] [23] [33] [33+43]
|
||||
* [44+04] [44+04+14] [24] [34] [44]
|
||||
*
|
||||
* Step 2: rank r sends block (r-2) to rank (r+1) and receives bloc
|
||||
* Step 2: rank r sends block (r-2) to rank (r+1) and receives bloc
|
||||
* (r-2) from rank (r-1) [with wraparound].
|
||||
* # 0 1 2 3 4
|
||||
* [00] [00+10] [01+10+20] [01+10+20+30] [40]
|
||||
@ -345,7 +327,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
* [33+43+03] [33+43+03+13] [23] [33] [33+43]
|
||||
* [44+04] [44+04+14] [44+04+14+24] [34] [44]
|
||||
*
|
||||
* Step 3: rank r sends block (r-3) to rank (r+1) and receives bloc
|
||||
* Step 3: rank r sends block (r-3) to rank (r+1) and receives bloc
|
||||
* (r-3) from rank (r-1) [with wraparound].
|
||||
* # 0 1 2 3 4
|
||||
* [00] [00+10] [01+10+20] [01+10+20+30] [FULL]
|
||||
@ -353,16 +335,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
|
||||
* [22+32+42+02] [FULL] [22] [22+32] [22+32+42]
|
||||
* [33+43+03] [33+43+03+13] [FULL] [33] [33+43]
|
||||
* [44+04] [44+04+14] [44+04+14+24] [FULL] [44]
|
||||
*
|
||||
*
|
||||
* DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1.
|
||||
*
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
int
|
||||
ompi_coll_base_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int ret, line, rank, size, k, recv_from, send_to, block_count, inbi;
|
||||
int early_segcount, late_segcount, split_rank, max_segcount;
|
||||
@ -375,9 +357,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allreduce_intra_ring rank %d, count %d", rank, count));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allreduce_intra_ring rank %d, count %d", rank, count));
|
||||
|
||||
/* Special case for size == 1 */
|
||||
if (1 == size) {
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
@ -389,10 +371,10 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
|
||||
/* Special case for count less than size - use recursive doubling */
|
||||
if (count < size) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count));
|
||||
return (ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf,
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count));
|
||||
return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
|
||||
count,
|
||||
dtype, op,
|
||||
dtype, op,
|
||||
comm, module));
|
||||
}
|
||||
|
||||
@ -404,14 +386,14 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
ret = ompi_datatype_type_size( dtype, &typelng);
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
/* Determine the number of elements per block and corresponding
|
||||
/* Determine the number of elements per block and corresponding
|
||||
block sizes.
|
||||
The blocks are divided into "early" and "late" ones:
|
||||
blocks 0 .. (split_rank - 1) are "early" and
|
||||
blocks 0 .. (split_rank - 1) are "early" and
|
||||
blocks (split_rank) .. (size - 1) are "late".
|
||||
Early blocks are at most 1 element larger than the late ones.
|
||||
*/
|
||||
COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
|
||||
COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank,
|
||||
early_segcount, late_segcount );
|
||||
max_segcount = early_segcount;
|
||||
max_real_segsize = true_extent + (max_segcount - 1) * extent;
|
||||
@ -432,7 +414,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
|
||||
/* Computation loop */
|
||||
|
||||
/*
|
||||
/*
|
||||
For each of the remote nodes:
|
||||
- post irecv for block (r-1)
|
||||
- send block (r)
|
||||
@ -456,8 +438,8 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
/* Send first block (my block) to the neighbor on the right */
|
||||
block_offset = ((rank < split_rank)?
|
||||
((ptrdiff_t)rank * (ptrdiff_t)early_segcount) :
|
||||
block_offset = ((rank < split_rank)?
|
||||
((ptrdiff_t)rank * (ptrdiff_t)early_segcount) :
|
||||
((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank));
|
||||
block_count = ((rank < split_rank)? early_segcount : late_segcount);
|
||||
tmpsend = ((char*)rbuf) + block_offset * extent;
|
||||
@ -465,21 +447,21 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
for (k = 2; k < size; k++) {
|
||||
const int prevblock = (rank + size - k + 1) % size;
|
||||
|
||||
|
||||
inbi = inbi ^ 0x1;
|
||||
|
||||
|
||||
/* Post irecv for the current block */
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* Wait on previous block to arrive */
|
||||
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* Apply operation on previous block: result goes to rbuf
|
||||
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
|
||||
*/
|
||||
@ -489,7 +471,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
block_count = ((prevblock < split_rank)? early_segcount : late_segcount);
|
||||
tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
|
||||
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);
|
||||
|
||||
|
||||
/* send previous block to send_to */
|
||||
ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
@ -501,7 +483,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
/* Apply operation on the last block (from neighbor (rank + 1)
|
||||
/* Apply operation on the last block (from neighbor (rank + 1)
|
||||
rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
|
||||
recv_from = (rank + 1) % size;
|
||||
block_offset = ((recv_from < split_rank)?
|
||||
@ -510,28 +492,28 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
block_count = ((recv_from < split_rank)? early_segcount : late_segcount);
|
||||
tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
|
||||
ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype);
|
||||
|
||||
|
||||
/* Distribution loop - variation of ring allgather */
|
||||
send_to = (rank + 1) % size;
|
||||
recv_from = (rank + size - 1) % size;
|
||||
for (k = 0; k < size - 1; k++) {
|
||||
const int recv_data_from = (rank + size - k) % size;
|
||||
const int send_data_from = (rank + 1 + size - k) % size;
|
||||
const int send_block_offset =
|
||||
const int send_block_offset =
|
||||
((send_data_from < split_rank)?
|
||||
((ptrdiff_t)send_data_from * early_segcount) :
|
||||
((ptrdiff_t)send_data_from * late_segcount + split_rank));
|
||||
const int recv_block_offset =
|
||||
const int recv_block_offset =
|
||||
((recv_data_from < split_rank)?
|
||||
((ptrdiff_t)recv_data_from * early_segcount) :
|
||||
((ptrdiff_t)recv_data_from * late_segcount + split_rank));
|
||||
block_count = ((send_data_from < split_rank)?
|
||||
block_count = ((send_data_from < split_rank)?
|
||||
early_segcount : late_segcount);
|
||||
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
|
||||
tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;
|
||||
|
||||
ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
|
||||
ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
tmprecv, max_segcount, dtype, recv_from,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
@ -546,7 +528,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
__FILE__, line, rank, ret));
|
||||
if (NULL != inbuf[0]) free(inbuf[0]);
|
||||
if (NULL != inbuf[1]) free(inbuf[1]);
|
||||
@ -554,30 +536,30 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
}
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_allreduce_intra_ring_segmented
|
||||
* ompi_coll_base_allreduce_intra_ring_segmented
|
||||
*
|
||||
* Function: Pipelined ring algorithm for allreduce operation
|
||||
* Accepts: Same as MPI_Allreduce(), segment size
|
||||
* Returns: MPI_SUCCESS or error code
|
||||
*
|
||||
* Description: Implements pipelined ring algorithm for allreduce:
|
||||
* Description: Implements pipelined ring algorithm for allreduce:
|
||||
* user supplies suggested segment size for the pipelining of
|
||||
* reduce operation.
|
||||
* The segment size determines the number of phases, np, for
|
||||
* the algorithm execution.
|
||||
* The message is automatically divided into blocks of
|
||||
* The segment size determines the number of phases, np, for
|
||||
* the algorithm execution.
|
||||
* The message is automatically divided into blocks of
|
||||
* approximately (count / (np * segcount)) elements.
|
||||
* At the end of reduction phase, allgather like step is
|
||||
* At the end of reduction phase, allgather like step is
|
||||
* executed.
|
||||
* Algorithm requires (np + 1)*(N - 1) steps.
|
||||
*
|
||||
* Limitations: The algorithm DOES NOT preserve order of operations so it
|
||||
* Limitations: The algorithm DOES NOT preserve order of operations so it
|
||||
* can be used only for commutative operations.
|
||||
* In addition, algorithm cannot work if the total size is
|
||||
* In addition, algorithm cannot work if the total size is
|
||||
* less than size * segment size.
|
||||
* Example on 3 nodes with 2 phases
|
||||
* Initial state
|
||||
* # 0 1 2
|
||||
* # 0 1 2
|
||||
* [00a] [10a] [20a]
|
||||
* [00b] [10b] [20b]
|
||||
* [01a] [11a] [21a]
|
||||
@ -586,9 +568,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
* [02b] [12b] [22b]
|
||||
*
|
||||
* COMPUTATION PHASE 0 (a)
|
||||
* Step 0: rank r sends block ra to rank (r+1) and receives bloc (r-1)a
|
||||
* Step 0: rank r sends block ra to rank (r+1) and receives bloc (r-1)a
|
||||
* from rank (r-1) [with wraparound].
|
||||
* # 0 1 2
|
||||
* # 0 1 2
|
||||
* [00a] [00a+10a] [20a]
|
||||
* [00b] [10b] [20b]
|
||||
* [01a] [11a] [11a+21a]
|
||||
@ -596,20 +578,20 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
* [22a+02a] [12a] [22a]
|
||||
* [02b] [12b] [22b]
|
||||
*
|
||||
* Step 1: rank r sends block (r-1)a to rank (r+1) and receives bloc
|
||||
* Step 1: rank r sends block (r-1)a to rank (r+1) and receives bloc
|
||||
* (r-2)a from rank (r-1) [with wraparound].
|
||||
* # 0 1 2
|
||||
* # 0 1 2
|
||||
* [00a] [00a+10a] [00a+10a+20a]
|
||||
* [00b] [10b] [20b]
|
||||
* [11a+21a+01a] [11a] [11a+21a]
|
||||
* [01b] [11b] [21b]
|
||||
* [22a+02a] [22a+02a+12a] [22a]
|
||||
* [02b] [12b] [22b]
|
||||
* [02b] [12b] [22b]
|
||||
*
|
||||
* COMPUTATION PHASE 1 (b)
|
||||
* Step 0: rank r sends block rb to rank (r+1) and receives bloc (r-1)b
|
||||
* Step 0: rank r sends block rb to rank (r+1) and receives bloc (r-1)b
|
||||
* from rank (r-1) [with wraparound].
|
||||
* # 0 1 2
|
||||
* # 0 1 2
|
||||
* [00a] [00a+10a] [20a]
|
||||
* [00b] [00b+10b] [20b]
|
||||
* [01a] [11a] [11a+21a]
|
||||
@ -617,31 +599,31 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
|
||||
* [22a+02a] [12a] [22a]
|
||||
* [22b+02b] [12b] [22b]
|
||||
*
|
||||
* Step 1: rank r sends block (r-1)b to rank (r+1) and receives bloc
|
||||
* Step 1: rank r sends block (r-1)b to rank (r+1) and receives bloc
|
||||
* (r-2)b from rank (r-1) [with wraparound].
|
||||
* # 0 1 2
|
||||
* # 0 1 2
|
||||
* [00a] [00a+10a] [00a+10a+20a]
|
||||
* [00b] [10b] [0bb+10b+20b]
|
||||
* [11a+21a+01a] [11a] [11a+21a]
|
||||
* [11b+21b+01b] [11b] [21b]
|
||||
* [22a+02a] [22a+02a+12a] [22a]
|
||||
* [02b] [22b+01b+12b] [22b]
|
||||
* [02b] [22b+01b+12b] [22b]
|
||||
*
|
||||
*
|
||||
*
|
||||
* DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1 (same as
|
||||
* in regular ring algorithm.
|
||||
*
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count,
|
||||
int
|
||||
ompi_coll_base_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
uint32_t segsize)
|
||||
uint32_t segsize)
|
||||
{
|
||||
int ret, line, rank, size, k, recv_from, send_to;
|
||||
int early_blockcount, late_blockcount, split_rank;
|
||||
int early_blockcount, late_blockcount, split_rank;
|
||||
int segcount, max_segcount, num_phases, phase, block_count, inbi;
|
||||
size_t typelng;
|
||||
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
|
||||
@ -652,9 +634,9 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:allreduce_intra_ring_segmented rank %d, count %d", rank, count));
|
||||
|
||||
/* Special case for size == 1 */
|
||||
if (1 == size) {
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
@ -672,34 +654,34 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
ret = ompi_datatype_type_size( dtype, &typelng);
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
segcount = count;
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)
|
||||
COLL_BASE_COMPUTED_SEGCOUNT(segsize, typelng, segcount)
|
||||
|
||||
/* Special case for count less than size * segcount - use regular ring */
|
||||
if (count < (size * segcount)) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count));
|
||||
return (ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count));
|
||||
return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
|
||||
comm, module));
|
||||
}
|
||||
|
||||
/* Determine the number of phases of the algorithm */
|
||||
num_phases = count / (size * segcount);
|
||||
if ((count % (size * segcount) >= size) &&
|
||||
if ((count % (size * segcount) >= size) &&
|
||||
(count % (size * segcount) > ((size * segcount) / 2))) {
|
||||
num_phases++;
|
||||
}
|
||||
|
||||
/* Determine the number of elements per block and corresponding
|
||||
/* Determine the number of elements per block and corresponding
|
||||
block sizes.
|
||||
The blocks are divided into "early" and "late" ones:
|
||||
blocks 0 .. (split_rank - 1) are "early" and
|
||||
blocks 0 .. (split_rank - 1) are "early" and
|
||||
blocks (split_rank) .. (size - 1) are "late".
|
||||
Early blocks are at most 1 element larger than the late ones.
|
||||
Note, these blocks will be split into num_phases segments,
|
||||
out of the largest one will have max_segcount elements.
|
||||
*/
|
||||
COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
|
||||
COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank,
|
||||
early_blockcount, late_blockcount );
|
||||
COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
|
||||
COLL_BASE_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
|
||||
max_segcount, k);
|
||||
max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent;
|
||||
|
||||
@ -722,7 +704,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
ptrdiff_t phase_offset;
|
||||
int early_phase_segcount, late_phase_segcount, split_phase, phase_count;
|
||||
|
||||
/*
|
||||
/*
|
||||
For each of the remote nodes:
|
||||
- post irecv for block (r-1)
|
||||
- send block (r)
|
||||
@ -741,7 +723,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
*/
|
||||
send_to = (rank + 1) % size;
|
||||
recv_from = (rank + size - 1) % size;
|
||||
|
||||
|
||||
inbi = 0;
|
||||
/* Initialize first receive from the neighbor on the left */
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
|
||||
@ -750,81 +732,81 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
/* Send first block (my block) to the neighbor on the right:
|
||||
- compute my block and phase offset
|
||||
- send data */
|
||||
block_offset = ((rank < split_rank)?
|
||||
((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) :
|
||||
block_offset = ((rank < split_rank)?
|
||||
((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) :
|
||||
((ptrdiff_t)rank * (ptrdiff_t)late_blockcount + split_rank));
|
||||
block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
|
||||
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
|
||||
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
|
||||
early_phase_segcount, late_phase_segcount)
|
||||
phase_count = ((phase < split_phase)?
|
||||
(early_phase_segcount) : (late_phase_segcount));
|
||||
phase_offset = ((phase < split_phase)?
|
||||
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
|
||||
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
|
||||
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
|
||||
tmpsend = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
|
||||
ret = MCA_PML_CALL(send(tmpsend, phase_count, dtype, send_to,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
for (k = 2; k < size; k++) {
|
||||
const int prevblock = (rank + size - k + 1) % size;
|
||||
|
||||
|
||||
inbi = inbi ^ 0x1;
|
||||
|
||||
|
||||
/* Post irecv for the current block */
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
|
||||
&reqs[inbi]));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* Wait on previous block to arrive */
|
||||
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* Apply operation on previous block: result goes to rbuf
|
||||
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
|
||||
*/
|
||||
block_offset = ((prevblock < split_rank)?
|
||||
((ptrdiff_t)prevblock * (ptrdiff_t)early_blockcount) :
|
||||
((ptrdiff_t)prevblock * (ptrdiff_t)late_blockcount + split_rank));
|
||||
block_count = ((prevblock < split_rank)?
|
||||
block_count = ((prevblock < split_rank)?
|
||||
early_blockcount : late_blockcount);
|
||||
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
|
||||
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
|
||||
early_phase_segcount, late_phase_segcount)
|
||||
phase_count = ((phase < split_phase)?
|
||||
(early_phase_segcount) : (late_phase_segcount));
|
||||
phase_offset = ((phase < split_phase)?
|
||||
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
|
||||
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
|
||||
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
|
||||
tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
|
||||
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);
|
||||
|
||||
|
||||
/* send previous block to send_to */
|
||||
ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
|
||||
/* Wait on the last block to arrive */
|
||||
ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
/* Apply operation on the last block (from neighbor (rank + 1)
|
||||
|
||||
/* Apply operation on the last block (from neighbor (rank + 1)
|
||||
rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
|
||||
recv_from = (rank + 1) % size;
|
||||
block_offset = ((recv_from < split_rank)?
|
||||
((ptrdiff_t)recv_from * (ptrdiff_t)early_blockcount) :
|
||||
((ptrdiff_t)recv_from * (ptrdiff_t)late_blockcount + split_rank));
|
||||
block_count = ((recv_from < split_rank)?
|
||||
block_count = ((recv_from < split_rank)?
|
||||
early_blockcount : late_blockcount);
|
||||
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
|
||||
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
|
||||
early_phase_segcount, late_phase_segcount)
|
||||
phase_count = ((phase < split_phase)?
|
||||
(early_phase_segcount) : (late_phase_segcount));
|
||||
phase_offset = ((phase < split_phase)?
|
||||
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
|
||||
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
|
||||
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
|
||||
tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
|
||||
ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype);
|
||||
@ -836,21 +818,21 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
for (k = 0; k < size - 1; k++) {
|
||||
const int recv_data_from = (rank + size - k) % size;
|
||||
const int send_data_from = (rank + 1 + size - k) % size;
|
||||
const int send_block_offset =
|
||||
const int send_block_offset =
|
||||
((send_data_from < split_rank)?
|
||||
((ptrdiff_t)send_data_from * (ptrdiff_t)early_blockcount) :
|
||||
((ptrdiff_t)send_data_from * (ptrdiff_t)late_blockcount + split_rank));
|
||||
const int recv_block_offset =
|
||||
const int recv_block_offset =
|
||||
((recv_data_from < split_rank)?
|
||||
((ptrdiff_t)recv_data_from * (ptrdiff_t)early_blockcount) :
|
||||
((ptrdiff_t)recv_data_from * (ptrdiff_t)late_blockcount + split_rank));
|
||||
block_count = ((send_data_from < split_rank)?
|
||||
block_count = ((send_data_from < split_rank)?
|
||||
early_blockcount : late_blockcount);
|
||||
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
|
||||
tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;
|
||||
|
||||
ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
|
||||
ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
tmprecv, early_blockcount, dtype, recv_from,
|
||||
MCA_COLL_BASE_TAG_ALLREDUCE,
|
||||
@ -865,7 +847,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
return MPI_SUCCESS;
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
__FILE__, line, rank, ret));
|
||||
if (NULL != inbuf[0]) free(inbuf[0]);
|
||||
if (NULL != inbuf[1]) free(inbuf[1]);
|
||||
@ -875,8 +857,8 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
@ -895,7 +877,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
ompi_coll_base_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
@ -905,158 +887,28 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_basic_linear rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_basic_linear rank %d", rank));
|
||||
|
||||
/* Reduce to 0 and broadcast. */
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
if (0 == rank) {
|
||||
err = ompi_coll_tuned_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype,
|
||||
err = ompi_coll_base_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype,
|
||||
op, 0, comm, module);
|
||||
} else {
|
||||
err = ompi_coll_tuned_reduce_intra_basic_linear(rbuf, NULL, count, dtype,
|
||||
err = ompi_coll_base_reduce_intra_basic_linear(rbuf, NULL, count, dtype,
|
||||
op, 0, comm, module);
|
||||
}
|
||||
} else {
|
||||
err = ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
|
||||
err = ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
|
||||
op, 0, comm, module);
|
||||
}
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module);
|
||||
return ompi_coll_base_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module);
|
||||
}
|
||||
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = coll_tuned_allreduce_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_count",
|
||||
"Number of allreduce algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_allreduce_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_allreduce_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_allreduce_algorithms", allreduce_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm",
|
||||
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allreduce_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_allreduce_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allreduce_segment_size);
|
||||
|
||||
coll_tuned_allreduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allreduce_tree_fanout);
|
||||
|
||||
coll_tuned_allreduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allreduce_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d, segment size %d",
|
||||
data->user_forced[ALLREDUCE].algorithm,
|
||||
data->user_forced[ALLREDUCE].segsize));
|
||||
|
||||
switch (data->user_forced[ALLREDUCE].algorithm) {
|
||||
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, data->user_forced[ALLREDUCE].segsize);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[ALLREDUCE].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, segsize);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -30,37 +30,18 @@
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
/* alltoall algorithm variables */
|
||||
static int coll_tuned_alltoall_algorithm_count = 5;
|
||||
static int coll_tuned_alltoall_forced_algorithm = 0;
|
||||
static int coll_tuned_alltoall_segment_size = 0;
|
||||
static int coll_tuned_alltoall_max_requests;
|
||||
static int coll_tuned_alltoall_tree_fanout;
|
||||
static int coll_tuned_alltoall_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_alltoall_forced_algorithm */
|
||||
static mca_base_var_enum_value_t alltoall_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "linear"},
|
||||
{2, "pairwise"},
|
||||
{3, "modified_bruck"},
|
||||
{4, "linear_sync"},
|
||||
{5, "two_proc"},
|
||||
{0, NULL}
|
||||
};
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */
|
||||
static int
|
||||
mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
int
|
||||
mca_coll_base_alltoall_intra_basic_inplace(void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
|
||||
int i, j, size, rank, err=MPI_SUCCESS;
|
||||
MPI_Request *preq;
|
||||
char *tmp_buffer;
|
||||
@ -91,7 +72,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
|
||||
for (i = 0 ; i < size ; ++i) {
|
||||
for (j = i+1 ; j < size ; ++j) {
|
||||
/* Initiate all send/recv to/from others. */
|
||||
preq = tuned_module->tuned_data->mcct_reqs;
|
||||
preq = coll_base_comm_get_reqs(base_module->base_data, size * 2);
|
||||
|
||||
if (i == rank) {
|
||||
/* Copy the data into the temporary buffer */
|
||||
@ -128,11 +109,8 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
|
||||
}
|
||||
|
||||
/* Wait for the requests to complete */
|
||||
err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
|
||||
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Free the requests. */
|
||||
mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
|
||||
}
|
||||
}
|
||||
|
||||
@ -145,7 +123,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
|
||||
return err;
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
|
||||
int ompi_coll_base_alltoall_intra_pairwise(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -157,22 +135,22 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
|
||||
ptrdiff_t lb, sext, rext;
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoall_intra_pairwise rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:alltoall_intra_pairwise rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
err = ompi_datatype_get_extent (rdtype, &lb, &rext);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
|
||||
|
||||
/* Perform pairwise exchange - starting from 1 so the local copy is last */
|
||||
for (step = 1; step < size + 1; step++) {
|
||||
|
||||
@ -185,51 +163,47 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)recvfrom * rext * (ptrdiff_t)rcount;
|
||||
|
||||
/* send and receive */
|
||||
err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto,
|
||||
err = ompi_coll_base_sendrecv( tmpsend, scount, sdtype, sendto,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
tmprecv, rcount, rdtype, recvfrom,
|
||||
tmprecv, rcount, rdtype, recvfrom,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
return MPI_SUCCESS;
|
||||
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line,
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line,
|
||||
err, rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
int ompi_coll_base_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, k, line = -1, rank, size, err = 0, weallocated = 0;
|
||||
int i, k, line = -1, rank, size, err = 0;
|
||||
int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
|
||||
char *tmpbuf = NULL, *tmpbuf_free = NULL;
|
||||
ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
|
||||
struct ompi_datatype_t *new_ddt;
|
||||
#ifdef blahblah
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
#endif
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoall_intra_bruck rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:alltoall_intra_bruck rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
@ -241,25 +215,10 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
|
||||
#ifdef blahblah
|
||||
/* try and SAVE memory by using the data segment hung off
|
||||
the communicator if possible */
|
||||
if (data->mcct_num_reqs >= size) {
|
||||
/* we have enought preallocated for displments and lengths */
|
||||
displs = (int*) data->mcct_reqs;
|
||||
blen = (int *) (displs + size);
|
||||
weallocated = 0;
|
||||
}
|
||||
else { /* allocate the buffers ourself */
|
||||
#endif
|
||||
displs = (int *) malloc(size * sizeof(int));
|
||||
if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
|
||||
blen = (int *) malloc(size * sizeof(int));
|
||||
if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
|
||||
weallocated = 1;
|
||||
#ifdef blahblah
|
||||
}
|
||||
#endif
|
||||
displs = (int *) malloc(size * sizeof(int));
|
||||
if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
|
||||
blen = (int *) malloc(size * sizeof(int));
|
||||
if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
|
||||
|
||||
/* tmp buffer allocation for message data */
|
||||
tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext);
|
||||
@ -267,9 +226,9 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
tmpbuf = tmpbuf_free - slb;
|
||||
|
||||
/* Step 1 - local rotation - shift up by rank */
|
||||
err = ompi_datatype_copy_content_same_ddt (sdtype,
|
||||
err = ompi_datatype_copy_content_same_ddt (sdtype,
|
||||
(int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
|
||||
tmpbuf,
|
||||
tmpbuf,
|
||||
((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
|
||||
if (err<0) {
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
@ -277,7 +236,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
|
||||
if (rank != 0) {
|
||||
err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
|
||||
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
|
||||
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
|
||||
(char*) sbuf);
|
||||
if (err<0) {
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
@ -294,7 +253,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
/* create indexed datatype */
|
||||
for (i = 1; i < size; i++) {
|
||||
if (( i & distance) == distance) {
|
||||
displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
|
||||
displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
|
||||
blen[k] = scount;
|
||||
k++;
|
||||
}
|
||||
@ -307,7 +266,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto,
|
||||
err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
rbuf, 1, new_ddt, recvfrom,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
@ -327,22 +286,20 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
for (i = 0; i < size; i++) {
|
||||
|
||||
err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
|
||||
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
|
||||
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
|
||||
tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
|
||||
if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
|
||||
}
|
||||
|
||||
/* Step 4 - clean up */
|
||||
if (tmpbuf != NULL) free(tmpbuf_free);
|
||||
if (weallocated) {
|
||||
if (displs != NULL) free(displs);
|
||||
if (blen != NULL) free(blen);
|
||||
}
|
||||
if (displs != NULL) free(displs);
|
||||
if (blen != NULL) free(blen);
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
|
||||
rank));
|
||||
if (tmpbuf != NULL) free(tmpbuf_free);
|
||||
if (displs != NULL) free(displs);
|
||||
@ -352,10 +309,10 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
|
||||
/*
|
||||
* alltoall_intra_linear_sync
|
||||
*
|
||||
*
|
||||
* Function: Linear implementation of alltoall with limited number
|
||||
* of outstanding requests.
|
||||
* Accepts: Same as MPI_Alltoall(), and the maximum number of
|
||||
* Accepts: Same as MPI_Alltoall(), and the maximum number of
|
||||
* outstanding requests (actual number is 2 * max, since
|
||||
* we count receive and send requests separately).
|
||||
* Returns: MPI_SUCCESS or error code
|
||||
@ -367,7 +324,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
* - wait for any request to complete
|
||||
* - replace that request by the new one of the same type.
|
||||
*/
|
||||
int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
int ompi_coll_base_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -382,7 +339,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
ompi_request_t **reqs = NULL;
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
@ -391,8 +348,8 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_alltoall_intra_linear_sync rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_alltoall_intra_linear_sync rank %d", rank));
|
||||
|
||||
error = ompi_datatype_get_extent(sdtype, &slb, &sext);
|
||||
if (OMPI_SUCCESS != error) {
|
||||
@ -423,18 +380,18 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
}
|
||||
|
||||
/* Initiate send/recv to/from others. */
|
||||
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
|
||||
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
|
||||
(max_outstanding_reqs <= 0)) ?
|
||||
(size - 1) : (max_outstanding_reqs));
|
||||
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
|
||||
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
|
||||
sizeof(ompi_request_t*));
|
||||
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
prcv = (char *) rbuf;
|
||||
psnd = (char *) sbuf;
|
||||
|
||||
/* Post first batch or ireceive and isend requests */
|
||||
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
|
||||
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
|
||||
ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
|
||||
error =
|
||||
MCA_PML_CALL(irecv
|
||||
@ -442,7 +399,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
|
||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
|
||||
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
|
||||
si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
|
||||
error =
|
||||
MCA_PML_CALL(isend
|
||||
@ -457,12 +414,12 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
/* Optimization for the case when all requests have been posted */
|
||||
error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
|
||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
} else {
|
||||
/* As requests complete, replace them with corresponding requests:
|
||||
- wait for any request to complete, mark the request as
|
||||
- wait for any request to complete, mark the request as
|
||||
MPI_REQUEST_NULL
|
||||
- If it was a receive request, replace it with new irecv request
|
||||
- If it was a receive request, replace it with new irecv request
|
||||
(if any)
|
||||
- if it was a send request, replace it with new isend request (if any)
|
||||
*/
|
||||
@ -476,10 +433,10 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
ncreqs++;
|
||||
if (completed < total_reqs) {
|
||||
if (nrreqs < (size - 1)) {
|
||||
error =
|
||||
error =
|
||||
MCA_PML_CALL(irecv
|
||||
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL, comm,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL, comm,
|
||||
&reqs[completed]));
|
||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||
++nrreqs;
|
||||
@ -493,7 +450,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
&reqs[completed]));
|
||||
++nsreqs;
|
||||
si = (si + size - 1) % size;
|
||||
si = (si + size - 1) % size;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -506,15 +463,15 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
|
||||
rank));
|
||||
if (NULL != reqs) free(reqs);
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
int ompi_coll_base_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -526,14 +483,14 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
ptrdiff_t sext, rext, lb;
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_alltoall_intra_two_procs rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_alltoall_intra_two_procs rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
@ -548,17 +505,17 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
tmprecv = (char*)rbuf + (ptrdiff_t)remote * rext * (ptrdiff_t)rcount;
|
||||
|
||||
/* send and receive */
|
||||
err = ompi_coll_tuned_sendrecv ( tmpsend, scount, sdtype, remote,
|
||||
err = ompi_coll_base_sendrecv ( tmpsend, scount, sdtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
tmprecv, rcount, rdtype, remote,
|
||||
tmprecv, rcount, rdtype, remote,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank );
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* ddt sendrecv your own data */
|
||||
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
|
||||
(int32_t) scount, sdtype,
|
||||
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
|
||||
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
|
||||
(int32_t) scount, sdtype,
|
||||
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
|
||||
(int32_t) rcount, rdtype);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
@ -566,7 +523,7 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
|
||||
rank));
|
||||
return err;
|
||||
@ -577,8 +534,8 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
@ -588,22 +545,22 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
/* copied function (with appropriate renaming) starts here */
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
int ompi_coll_base_alltoall_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, rank, size, err, nreqs;
|
||||
char *psnd, *prcv;
|
||||
MPI_Aint lb, sndinc, rcvinc;
|
||||
ompi_request_t **req, **sreq, **rreq;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
|
||||
mca_coll_base_comm_t *data = base_module->base_data;
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
@ -612,9 +569,8 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_alltoall_intra_basic_linear rank %d", rank));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_alltoall_intra_basic_linear rank %d", rank));
|
||||
|
||||
err = ompi_datatype_get_extent(sdtype, &lb, &sndinc);
|
||||
if (OMPI_SUCCESS != err) {
|
||||
@ -646,44 +602,41 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
|
||||
|
||||
/* Initiate all send/recv to/from others. */
|
||||
|
||||
req = rreq = data->mcct_reqs;
|
||||
sreq = rreq + size - 1;
|
||||
req = rreq = coll_base_comm_get_reqs(data, (size - 1) * 2);
|
||||
|
||||
prcv = (char *) rbuf;
|
||||
psnd = (char *) sbuf;
|
||||
|
||||
/* Post all receives first -- a simple optimization */
|
||||
|
||||
for (nreqs = 0, i = (rank + 1) % size; i != rank;
|
||||
for (nreqs = 0, i = (rank + 1) % size; i != rank;
|
||||
i = (i + 1) % size, ++rreq, ++nreqs) {
|
||||
err =
|
||||
MCA_PML_CALL(irecv_init
|
||||
(prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
|
||||
err = MCA_PML_CALL(irecv_init
|
||||
(prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
|
||||
if (MPI_SUCCESS != err) {
|
||||
ompi_coll_tuned_free_reqs(req, rreq - req);
|
||||
ompi_coll_base_free_reqs(req, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now post all sends in reverse order
|
||||
/* Now post all sends in reverse order
|
||||
- We would like to minimize the search time through message queue
|
||||
when messages actually arrive in the order in which they were posted.
|
||||
*/
|
||||
for (nreqs = 0, i = (rank + size - 1) % size; i != rank;
|
||||
sreq = rreq;
|
||||
for (i = (rank + size - 1) % size; i != rank;
|
||||
i = (i + size - 1) % size, ++sreq, ++nreqs) {
|
||||
err =
|
||||
MCA_PML_CALL(isend_init
|
||||
(psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
|
||||
err = MCA_PML_CALL(isend_init
|
||||
(psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
|
||||
if (MPI_SUCCESS != err) {
|
||||
ompi_coll_tuned_free_reqs(req, sreq - req);
|
||||
ompi_coll_base_free_reqs(req, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
nreqs = (size - 1) * 2;
|
||||
/* Start your engines. This will never return an error. */
|
||||
|
||||
MCA_PML_CALL(start(nreqs, req));
|
||||
@ -698,165 +651,10 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
|
||||
err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);
|
||||
|
||||
/* Free the reqs */
|
||||
|
||||
ompi_coll_tuned_free_reqs(req, nreqs);
|
||||
ompi_coll_base_free_reqs(req, nreqs);
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t*new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = coll_tuned_alltoall_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_count",
|
||||
"Number of alltoall algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_alltoall_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_alltoall_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_alltoall_algorithms", alltoall_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm",
|
||||
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: linear with sync, 5:two proc only.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_alltoall_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_alltoall_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_alltoall_segment_size);
|
||||
|
||||
coll_tuned_alltoall_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_alltoall_tree_fanout);
|
||||
|
||||
coll_tuned_alltoall_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_chain_fanout",
|
||||
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_alltoall_chain_fanout);
|
||||
|
||||
coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
|
||||
mca_param_indices->max_requests_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_max_requests",
|
||||
"Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_alltoall_max_requests);
|
||||
if (mca_param_indices->max_requests_param_index < 0) {
|
||||
return mca_param_indices->max_requests_param_index;
|
||||
}
|
||||
|
||||
if (coll_tuned_alltoall_max_requests < 0) {
|
||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||
opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to system level default %d \n",
|
||||
ompi_coll_tuned_init_max_requests );
|
||||
}
|
||||
coll_tuned_alltoall_max_requests = 0;
|
||||
}
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[ALLTOALL].algorithm));
|
||||
|
||||
switch (data->user_forced[ALLTOALL].algorithm) {
|
||||
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, data->user_forced[ALLTOALL].max_requests);
|
||||
case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize,
|
||||
int max_requests)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, max_requests);
|
||||
case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -32,29 +32,17 @@
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/* alltoallv algorithm variables */
|
||||
static int coll_tuned_alltoallv_algorithm_count = 2;
|
||||
static int coll_tuned_alltoallv_forced_algorithm = 0;
|
||||
|
||||
/* valid values for coll_tuned_alltoallv_forced_algorithm */
|
||||
static mca_base_var_enum_value_t alltoallv_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "pairwise"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
static int
|
||||
mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
|
||||
int
|
||||
mca_coll_base_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
|
||||
int i, j, size, rank, err=MPI_SUCCESS;
|
||||
MPI_Request *preq;
|
||||
char *tmp_buffer;
|
||||
@ -90,7 +78,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
|
||||
for (i = 0 ; i < size ; ++i) {
|
||||
for (j = i+1 ; j < size ; ++j) {
|
||||
/* Initiate all send/recv to/from others. */
|
||||
preq = tuned_module->tuned_data->mcct_reqs;
|
||||
preq = coll_base_comm_get_reqs(base_module->base_data, 2);
|
||||
|
||||
if (i == rank && rcounts[j]) {
|
||||
/* Copy the data into the temporary buffer */
|
||||
@ -127,11 +115,8 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
|
||||
}
|
||||
|
||||
/* Wait for the requests to complete */
|
||||
err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
|
||||
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Free the requests. */
|
||||
mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
|
||||
}
|
||||
}
|
||||
|
||||
@ -145,7 +130,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
|
||||
ompi_coll_base_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -157,15 +142,15 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
|
||||
ptrdiff_t sext, rext;
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
|
||||
return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
|
||||
rdtype, comm, module);
|
||||
}
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoallv_intra_pairwise rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:alltoallv_intra_pairwise rank %d", rank));
|
||||
|
||||
ompi_datatype_type_extent(sdtype, &sext);
|
||||
ompi_datatype_type_extent(rdtype, &rext);
|
||||
@ -182,34 +167,33 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
|
||||
prcv = (char*)rbuf + (ptrdiff_t)rdisps[recvfrom] * rext;
|
||||
|
||||
/* send and receive */
|
||||
err = ompi_coll_tuned_sendrecv( psnd, scounts[sendto], sdtype, sendto,
|
||||
err = ompi_coll_base_sendrecv( psnd, scounts[sendto], sdtype, sendto,
|
||||
MCA_COLL_BASE_TAG_ALLTOALLV,
|
||||
prcv, rcounts[recvfrom], rdtype, recvfrom,
|
||||
prcv, rcounts[recvfrom], rdtype, recvfrom,
|
||||
MCA_COLL_BASE_TAG_ALLTOALLV,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
return MPI_SUCCESS;
|
||||
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line,
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line,
|
||||
err, rank, step));
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* Linear functions are copied from the basic coll module. For
|
||||
* some small number of nodes and/or small data sizes they are just as
|
||||
* fast as tuned/tree based segmenting operations and as such may be
|
||||
* fast as base/tree based segmenting operations and as such may be
|
||||
* selected by the decision functions. These are copied into this module
|
||||
* due to the way we select modules in V1. i.e. in V2 we will handle this
|
||||
* differently and so will not have to duplicate code.
|
||||
* GEF Oct05 after asking Jeff.
|
||||
* differently and so will not have to duplicate code.
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
|
||||
ompi_coll_base_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -220,19 +204,19 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
|
||||
char *psnd, *prcv;
|
||||
ptrdiff_t sext, rext;
|
||||
MPI_Request *preq;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
|
||||
mca_coll_base_comm_t *data = base_module->base_data;
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
|
||||
return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
|
||||
rdtype, comm, module);
|
||||
}
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoallv_intra_basic_linear rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:alltoallv_intra_basic_linear rank %d", rank));
|
||||
|
||||
ompi_datatype_type_extent(sdtype, &sext);
|
||||
ompi_datatype_type_extent(rdtype, &rext);
|
||||
@ -255,7 +239,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
|
||||
|
||||
/* Now, initiate all send/recv to/from others. */
|
||||
nreqs = 0;
|
||||
preq = data->mcct_reqs;
|
||||
preq = coll_base_comm_get_reqs(data, 2 * size);
|
||||
|
||||
/* Post all receives first */
|
||||
for (i = 0; i < size; ++i) {
|
||||
@ -269,7 +253,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
|
||||
preq++));
|
||||
++nreqs;
|
||||
if (MPI_SUCCESS != err) {
|
||||
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
|
||||
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
@ -287,7 +271,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
|
||||
preq++));
|
||||
++nreqs;
|
||||
if (MPI_SUCCESS != err) {
|
||||
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
|
||||
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
@ -305,128 +289,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
|
||||
MPI_STATUSES_IGNORE);
|
||||
|
||||
/* Free the requests. */
|
||||
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
|
||||
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* The following are used by dynamic and forced rules. Publish
|
||||
* details of each algorithm and if its forced/fixed/locked in as you add
|
||||
* methods/algorithms you must update this and the query/map routines.
|
||||
* This routine is called by the component only. This makes sure that
|
||||
* the mca parameters are set to their initial values and perms.
|
||||
* Module does not call this. They call the forced_getvalues routine
|
||||
* instead.
|
||||
*/
|
||||
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t
|
||||
*mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV] = coll_tuned_alltoallv_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoallv_algorithm_count",
|
||||
"Number of alltoallv algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_alltoallv_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_alltoallv_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_alltoallv_algorithms", alltoallv_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoallv_algorithm",
|
||||
"Which alltoallv algorithm is used. "
|
||||
"Can be locked down to choice of: 0 ignore, "
|
||||
"1 basic linear, 2 pairwise.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_alltoallv_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoallv_intra_do_forced(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoallv_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[ALLTOALLV].algorithm));
|
||||
|
||||
switch (data->user_forced[ALLTOALLV].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoallv_intra_do_forced attempt to "
|
||||
"select algorithm %d when only 0-%d is valid.",
|
||||
data->user_forced[ALLTOALLV].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
}
|
||||
|
||||
/* If the user selects dynamic rules and specifies the algorithm to
|
||||
* use, then this function is called. */
|
||||
int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoallv_intra_do_this selected algorithm %d ",
|
||||
algorithm));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoall_intra_do_this attempt to select "
|
||||
"algorithm %d when only 0-%d is valid.",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
}
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -31,25 +31,9 @@
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
/* barrier algorithm variables */
|
||||
static int coll_tuned_barrier_algorithm_count = 6;
|
||||
static int coll_tuned_barrier_forced_algorithm = 0;
|
||||
|
||||
/* valid values for coll_tuned_barrier_forced_algorithm */
|
||||
static mca_base_var_enum_value_t barrier_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "linear"},
|
||||
{2, "double_ring"},
|
||||
{3, "recursive_doubling"},
|
||||
{4, "bruck"},
|
||||
{5, "two_proc"},
|
||||
{6, "tree"},
|
||||
{0, NULL}
|
||||
};
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/**
|
||||
* A quick version of the MPI_Sendreceive implemented for the barrier.
|
||||
@ -57,7 +41,7 @@ static mca_base_var_enum_value_t barrier_algorithms[] = {
|
||||
* signal a two peer synchronization.
|
||||
*/
|
||||
static inline int
|
||||
ompi_coll_tuned_sendrecv_zero(int dest, int stag,
|
||||
ompi_coll_base_sendrecv_zero(int dest, int stag,
|
||||
int source, int rtag,
|
||||
MPI_Comm comm)
|
||||
|
||||
@ -87,8 +71,8 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
|
||||
err_index = 1;
|
||||
}
|
||||
err = statuses[err_index].MPI_ERROR;
|
||||
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s"
|
||||
" stage of ompi_coll_tuned_sendrecv_zero\n",
|
||||
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
|
||||
" stage of ompi_coll_base_sendrecv_zero\n",
|
||||
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
|
||||
return err;
|
||||
}
|
||||
@ -100,21 +84,21 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
|
||||
/* Error discovered during the posting of the irecv or isend,
|
||||
* and no status is available.
|
||||
*/
|
||||
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",
|
||||
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
|
||||
__FILE__, line, err));
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Barrier is ment to be a synchronous operation, as some BTLs can mark
|
||||
* a request done before its passed to the NIC and progress might not be made
|
||||
* elsewhere we cannot allow a process to exit the barrier until its last
|
||||
* Barrier is ment to be a synchronous operation, as some BTLs can mark
|
||||
* a request done before its passed to the NIC and progress might not be made
|
||||
* elsewhere we cannot allow a process to exit the barrier until its last
|
||||
* [round of] sends are completed.
|
||||
*
|
||||
* It is last round of sends rather than 'last' individual send as each pair of
|
||||
* peers can use different channels/devices/btls and the receiver of one of
|
||||
* It is last round of sends rather than 'last' individual send as each pair of
|
||||
* peers can use different channels/devices/btls and the receiver of one of
|
||||
* these sends might be forced to wait as the sender
|
||||
* leaves the collective and does not make progress until the next mpi call
|
||||
* leaves the collective and does not make progress until the next mpi call
|
||||
*
|
||||
*/
|
||||
|
||||
@ -124,7 +108,7 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
|
||||
* synchronous gurantee made by last ring of sends are synchronous
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
||||
int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int rank, size, err = 0, line = 0, left, right;
|
||||
@ -132,50 +116,50 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_doublering rank %d", rank));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));
|
||||
|
||||
left = ((rank-1)%size);
|
||||
right = ((rank+1)%size);
|
||||
|
||||
if (rank > 0) { /* receive message from the left */
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
/* Send message to the right */
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* root needs to receive from the last node */
|
||||
if (rank == 0) {
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
/* Allow nodes to exit */
|
||||
if (rank > 0) { /* post Receive from left */
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
/* send message to the right one */
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
|
||||
/* rank 0 post receive from the last node */
|
||||
if (rank == 0) {
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
@ -183,7 +167,7 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
@ -193,15 +177,15 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
||||
* To make synchronous, uses sync sends and sync sendrecvs
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
|
||||
int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int rank, size, adjsize, err, line, mask, remote;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_barrier_intra_recursivedoubling rank %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_recursivedoubling rank %d",
|
||||
rank));
|
||||
|
||||
/* do nearest power of 2 less than size calc */
|
||||
@ -213,7 +197,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
if (rank >= adjsize) {
|
||||
/* send message to lower ranked node */
|
||||
remote = rank - adjsize;
|
||||
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
@ -222,7 +206,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
|
||||
/* receive message from high level rank */
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
@ -238,7 +222,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
if (remote >= adjsize) continue;
|
||||
|
||||
/* post receive from the remote node */
|
||||
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
@ -250,8 +234,8 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
if (rank < (size - adjsize)) {
|
||||
/* send enter message to higher ranked node */
|
||||
remote = rank + adjsize;
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
|
||||
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
@ -261,7 +245,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
@ -271,23 +255,23 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
* To make synchronous, uses sync sends and sync sendrecvs
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
||||
int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int rank, size, distance, to, from, err, line = 0;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_barrier_intra_bruck rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_bruck rank %d", rank));
|
||||
|
||||
/* exchange data with rank-2^k and rank+2^k */
|
||||
for (distance = 1; distance < size; distance <<= 1) {
|
||||
for (distance = 1; distance < size; distance <<= 1) {
|
||||
from = (rank + size - distance) % size;
|
||||
to = (rank + distance) % size;
|
||||
|
||||
/* send message to lower ranked node */
|
||||
err = ompi_coll_tuned_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
|
||||
err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
|
||||
from, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
@ -296,7 +280,7 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
@ -306,17 +290,17 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
||||
* To make synchronous, uses sync sends and sync sendrecvs
|
||||
*/
|
||||
/* special case for two processes */
|
||||
int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
||||
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int remote, err;
|
||||
|
||||
remote = ompi_comm_rank(comm);
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_barrier_intra_two_procs rank %d", remote));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_two_procs rank %d", remote));
|
||||
remote = (remote + 1) & 0x1;
|
||||
|
||||
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm);
|
||||
return (err);
|
||||
@ -327,7 +311,7 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
@ -337,8 +321,8 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
||||
|
||||
/* copied function (with appropriate renaming) starts here */
|
||||
|
||||
static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, err, rank, size;
|
||||
|
||||
@ -347,14 +331,14 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
|
||||
|
||||
/* All non-root send & receive zero-length message. */
|
||||
if (rank > 0) {
|
||||
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
|
||||
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
|
||||
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
if (MPI_SUCCESS != err) {
|
||||
@ -370,7 +354,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
|
||||
requests = (ompi_request_t**)malloc( size * sizeof(ompi_request_t*) );
|
||||
for (i = 1; i < size; ++i) {
|
||||
err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
&(requests[i])));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
@ -379,15 +363,14 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
|
||||
ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
|
||||
|
||||
for (i = 1; i < size; ++i) {
|
||||
err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, i,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
&(requests[i])));
|
||||
err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
|
||||
|
||||
free( requests );
|
||||
}
|
||||
|
||||
@ -400,17 +383,17 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
|
||||
|
||||
/*
|
||||
* Another recursive doubling type algorithm, but in this case
|
||||
* we go up the tree and back down the tree.
|
||||
* we go up the tree and back down the tree.
|
||||
*/
|
||||
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
|
||||
int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int rank, size, depth, err, jump, partner;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_barrier_intra_tree %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_barrier_intra_tree %d",
|
||||
rank));
|
||||
|
||||
/* Find the nearest power of 2 of the communicator size. */
|
||||
@ -420,21 +403,21 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
|
||||
partner = rank ^ jump;
|
||||
if (!(partner & (jump-1)) && partner < size) {
|
||||
if (partner > rank) {
|
||||
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
|
||||
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (MPI_SUCCESS != err)
|
||||
return err;
|
||||
} else if (partner < rank) {
|
||||
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != err)
|
||||
return err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
depth >>= 1;
|
||||
for (jump = depth; jump>0; jump>>=1) {
|
||||
partner = rank ^ jump;
|
||||
@ -446,7 +429,7 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
|
||||
if (MPI_SUCCESS != err)
|
||||
return err;
|
||||
} else if (partner < rank) {
|
||||
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
|
||||
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (MPI_SUCCESS != err)
|
||||
@ -457,101 +440,3 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map */
|
||||
/* routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values */
|
||||
/* and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[BARRIER] = coll_tuned_barrier_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm_count",
|
||||
"Number of barrier algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_barrier_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_barrier_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_barrier_algorithms", barrier_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm",
|
||||
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_barrier_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:barrier_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[BARRIER].algorithm));
|
||||
|
||||
switch (data->user_forced[BARRIER].algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
|
||||
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module);
|
||||
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module);
|
||||
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module);
|
||||
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module);
|
||||
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module);
|
||||
case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[BARRIER].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
|
||||
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module);
|
||||
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module);
|
||||
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module);
|
||||
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module);
|
||||
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module);
|
||||
case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
||||
|
@ -3,18 +3,18 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -27,33 +27,14 @@
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
/* bcast algorithm variables */
|
||||
static int coll_tuned_bcast_algorithm_count = 6;
|
||||
static int coll_tuned_bcast_forced_algorithm = 0;
|
||||
static int coll_tuned_bcast_segment_size = 0;
|
||||
static int coll_tuned_bcast_tree_fanout;
|
||||
static int coll_tuned_bcast_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_bcast_forced_algorithm */
|
||||
static mca_base_var_enum_value_t bcast_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "chain"},
|
||||
{3, "pipeline"},
|
||||
{4, "split_binary_tree"},
|
||||
{5, "binary_tree"},
|
||||
{6, "binomial"},
|
||||
{0, NULL}
|
||||
};
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
int original_count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
ompi_coll_base_bcast_intra_generic( void* buffer,
|
||||
int original_count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
mca_coll_base_module_t *module,
|
||||
@ -62,12 +43,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
{
|
||||
int err = 0, line, i, rank, size, segindex, req_index;
|
||||
int num_segments; /* Number of segments */
|
||||
int sendcount; /* number of elements sent in this segment */
|
||||
int sendcount; /* number of elements sent in this segment */
|
||||
size_t realsegsize, type_size;
|
||||
char *tmpbuf;
|
||||
ptrdiff_t extent, lb;
|
||||
ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
ompi_request_t **send_reqs = NULL;
|
||||
#endif
|
||||
|
||||
@ -79,20 +60,20 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
ompi_datatype_type_size( datatype, &type_size );
|
||||
num_segments = (original_count + count_by_segment - 1) / count_by_segment;
|
||||
realsegsize = (ptrdiff_t)count_by_segment * extent;
|
||||
|
||||
|
||||
/* Set the buffer pointers */
|
||||
tmpbuf = (char *) buffer;
|
||||
|
||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
if( tree->tree_nextsize != 0 ) {
|
||||
send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
|
||||
send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
|
||||
sizeof(ompi_request_t*) );
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Root code */
|
||||
if( rank == root ) {
|
||||
/*
|
||||
/*
|
||||
For each segment:
|
||||
- send segment to all children.
|
||||
The last segment may have less elements than other segments.
|
||||
@ -102,39 +83,39 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
if( segindex == (num_segments - 1) ) {
|
||||
sendcount = original_count - segindex * count_by_segment;
|
||||
}
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) {
|
||||
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) {
|
||||
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
|
||||
tree->tree_next[i],
|
||||
tree->tree_next[i],
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
#else
|
||||
err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
|
||||
tree->tree_next[i],
|
||||
tree->tree_next[i],
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
&send_reqs[i]));
|
||||
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
|
||||
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
/* complete the sends before starting the next sends */
|
||||
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
|
||||
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
|
||||
MPI_STATUSES_IGNORE );
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
#endif /* not COLL_TUNED_BCAST_USE_BLOCKING */
|
||||
#endif /* not COLL_BASE_BCAST_USE_BLOCKING */
|
||||
|
||||
/* update tmp buffer */
|
||||
tmpbuf += realsegsize;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* Intermediate nodes code */
|
||||
else if( tree->tree_nextsize > 0 ) {
|
||||
/*
|
||||
Create the pipeline.
|
||||
else if( tree->tree_nextsize > 0 ) {
|
||||
/*
|
||||
Create the pipeline.
|
||||
1) Post the first receive
|
||||
2) For segments 1 .. num_segments
|
||||
- post new receive
|
||||
@ -149,49 +130,49 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, &recv_reqs[req_index]));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
for( segindex = 1; segindex < num_segments; segindex++ ) {
|
||||
|
||||
|
||||
req_index = req_index ^ 0x1;
|
||||
|
||||
|
||||
/* post new irecv */
|
||||
err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment,
|
||||
datatype, tree->tree_prev,
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
datatype, tree->tree_prev,
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, &recv_reqs[req_index]));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* wait for and forward the previous segment to children */
|
||||
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
|
||||
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
|
||||
MPI_STATUSES_IGNORE );
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) {
|
||||
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) {
|
||||
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
err = MCA_PML_CALL(send(tmpbuf, count_by_segment, datatype,
|
||||
tree->tree_next[i],
|
||||
tree->tree_next[i],
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
#else
|
||||
err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype,
|
||||
tree->tree_next[i],
|
||||
tree->tree_next[i],
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
&send_reqs[i]));
|
||||
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
|
||||
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
}
|
||||
|
||||
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
/* complete the sends before starting the next iteration */
|
||||
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
|
||||
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
|
||||
MPI_STATUSES_IGNORE );
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
|
||||
|
||||
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
|
||||
|
||||
/* Update the receive buffer */
|
||||
tmpbuf += realsegsize;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/* Process the last segment */
|
||||
@ -199,31 +180,31 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment;
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) {
|
||||
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
|
||||
tree->tree_next[i],
|
||||
tree->tree_next[i],
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
#else
|
||||
err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
|
||||
tree->tree_next[i],
|
||||
tree->tree_next[i],
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
&send_reqs[i]));
|
||||
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
|
||||
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
|
||||
|
||||
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
|
||||
MPI_STATUSES_IGNORE );
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
|
||||
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
|
||||
}
|
||||
|
||||
|
||||
/* Leaf nodes */
|
||||
else {
|
||||
/*
|
||||
/*
|
||||
Receive all segments from parent in a loop:
|
||||
1) post irecv for the first segment
|
||||
2) for segments 1 .. num_segments
|
||||
@ -241,12 +222,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
req_index = req_index ^ 0x1;
|
||||
tmpbuf += realsegsize;
|
||||
/* post receive for the next segment */
|
||||
err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
|
||||
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||
err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
|
||||
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, &recv_reqs[req_index]));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
/* wait on the previous segment */
|
||||
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
|
||||
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
|
||||
MPI_STATUS_IGNORE );
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
@ -255,25 +236,25 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
if( NULL != send_reqs ) free(send_reqs);
|
||||
#endif
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT( (ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank) );
|
||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
|
||||
if( NULL != send_reqs ) free(send_reqs);
|
||||
#endif
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
ompi_coll_base_bcast_intra_bintree ( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
mca_coll_base_module_t *module,
|
||||
@ -281,28 +262,27 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
{
|
||||
int segcount = count;
|
||||
size_t typelng;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_comm_t *data = module->base_data;
|
||||
|
||||
COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
|
||||
COLL_BASE_UPDATE_BINTREE( comm, module, root );
|
||||
|
||||
/**
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_datatype_type_size( datatype, &typelng );
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d",
|
||||
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
segcount, data->cached_bintree );
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
ompi_coll_base_bcast_intra_pipeline( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
mca_coll_base_module_t *module,
|
||||
@ -310,28 +290,27 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
|
||||
{
|
||||
int segcount = count;
|
||||
size_t typelng;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_comm_t *data = module->base_data;
|
||||
|
||||
COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );
|
||||
COLL_BASE_UPDATE_PIPELINE( comm, module, root );
|
||||
|
||||
/**
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_datatype_type_size( datatype, &typelng );
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d",
|
||||
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
segcount, data->cached_pipeline );
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_chain( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
ompi_coll_base_bcast_intra_chain( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
mca_coll_base_module_t *module,
|
||||
@ -339,28 +318,27 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer,
|
||||
{
|
||||
int segcount = count;
|
||||
size_t typelng;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_comm_t *data = module->base_data;
|
||||
|
||||
COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, chains );
|
||||
COLL_BASE_UPDATE_CHAIN( comm, module, root, chains );
|
||||
|
||||
/**
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_datatype_type_size( datatype, &typelng );
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
|
||||
ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
segcount, data->cached_chain );
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_binomial( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
ompi_coll_base_bcast_intra_binomial( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
mca_coll_base_module_t *module,
|
||||
@ -368,28 +346,27 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer,
|
||||
{
|
||||
int segcount = count;
|
||||
size_t typelng;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_comm_t *data = module->base_data;
|
||||
|
||||
COLL_TUNED_UPDATE_BMTREE( comm, tuned_module, root );
|
||||
COLL_BASE_UPDATE_BMTREE( comm, module, root );
|
||||
|
||||
/**
|
||||
* Determine number of elements sent per operation.
|
||||
*/
|
||||
ompi_datatype_type_size( datatype, &typelng );
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
|
||||
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
|
||||
segcount, data->cached_bmtree );
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
ompi_coll_base_bcast_intra_split_bintree ( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
mca_coll_base_module_t *module,
|
||||
@ -399,26 +376,25 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
uint32_t counts[2];
|
||||
int segcount[2]; /* Number of elements sent with each segment */
|
||||
int num_segments[2]; /* Number of segmenets */
|
||||
int sendcount[2]; /* the same like segcount, except for the last segment */
|
||||
int sendcount[2]; /* the same like segcount, except for the last segment */
|
||||
size_t realsegsize[2], type_size;
|
||||
char *tmpbuf[2];
|
||||
ptrdiff_t type_extent, lb;
|
||||
ompi_request_t *base_req, *new_req;
|
||||
ompi_coll_tree_t *tree;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_comm_t *data = module->base_data;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));
|
||||
|
||||
if (size == 1) {
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* setup the binary tree topology. */
|
||||
COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
|
||||
COLL_BASE_UPDATE_BINTREE( comm, module, root );
|
||||
tree = data->cached_bintree;
|
||||
|
||||
err = ompi_datatype_type_size( datatype, &type_size );
|
||||
@ -431,10 +407,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
/* Note that ompi_datatype_type_size() will never return a negative
|
||||
value in typelng; it returns an int [vs. an unsigned type]
|
||||
because of the MPI spec. */
|
||||
if (segsize < ((uint32_t) type_size)) {
|
||||
if (segsize < ((uint32_t) type_size)) {
|
||||
segsize = type_size; /* push segsize up to hold one type */
|
||||
}
|
||||
segcount[0] = segcount[1] = segsize / type_size;
|
||||
segcount[0] = segcount[1] = segsize / type_size;
|
||||
num_segments[0] = counts[0]/segcount[0];
|
||||
if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
|
||||
num_segments[1] = counts[1]/segcount[1];
|
||||
@ -450,17 +426,17 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
(segsize > ((ptrdiff_t)counts[0] * type_size)) ||
|
||||
(segsize > ((ptrdiff_t)counts[1] * type_size)) ) {
|
||||
/* call linear version here ! */
|
||||
return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype,
|
||||
return (ompi_coll_base_bcast_intra_chain ( buffer, count, datatype,
|
||||
root, comm, module,
|
||||
segsize, 1 ));
|
||||
}
|
||||
|
||||
err = ompi_datatype_get_extent (datatype, &lb, &type_extent);
|
||||
|
||||
|
||||
/* Determine real segment size */
|
||||
realsegsize[0] = (ptrdiff_t)segcount[0] * type_extent;
|
||||
realsegsize[1] = (ptrdiff_t)segcount[1] * type_extent;
|
||||
|
||||
|
||||
/* set the buffer pointers */
|
||||
tmpbuf[0] = (char *) buffer;
|
||||
tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;
|
||||
@ -473,11 +449,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
|
||||
/* determine if I am left (0) or right (1), (root is right) */
|
||||
lr = ((rank + size - root)%size + 1)%2;
|
||||
|
||||
|
||||
/* root code */
|
||||
if( rank == root ) {
|
||||
/* determine segment count */
|
||||
sendcount[0] = segcount[0];
|
||||
sendcount[0] = segcount[0];
|
||||
sendcount[1] = segcount[1];
|
||||
/* for each segment */
|
||||
for (segindex = 0; segindex < num_segments[0]; segindex++) {
|
||||
@ -487,7 +463,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
continue;
|
||||
}
|
||||
/* determine how many elements are being sent in this round */
|
||||
if(segindex == (num_segments[i] - 1))
|
||||
if(segindex == (num_segments[i] - 1))
|
||||
sendcount[i] = counts[i] - segindex*segcount[i];
|
||||
/* send data */
|
||||
MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype,
|
||||
@ -498,19 +474,19 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
tmpbuf[i] += realsegsize[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* intermediate nodes code */
|
||||
else if( tree->tree_nextsize > 0 ) {
|
||||
else if( tree->tree_nextsize > 0 ) {
|
||||
/* Intermediate nodes:
|
||||
* It will receive segments only from one half of the data.
|
||||
* Which one is determined by whether the node belongs to the "left" or "right"
|
||||
* Which one is determined by whether the node belongs to the "left" or "right"
|
||||
* subtree. Topoloby building function builds binary tree such that
|
||||
* odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
|
||||
* and even on the right subtree.
|
||||
*
|
||||
* Create the pipeline. We first post the first receive, then in the loop we
|
||||
* post the next receive and after that wait for the previous receive to complete
|
||||
* post the next receive and after that wait for the previous receive to complete
|
||||
* and we disseminating the data to all children.
|
||||
*/
|
||||
sendcount[lr] = segcount[lr];
|
||||
@ -521,11 +497,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
|
||||
for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
|
||||
/* determine how many elements to expect in this round */
|
||||
if( segindex == (num_segments[lr] - 1))
|
||||
if( segindex == (num_segments[lr] - 1))
|
||||
sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr];
|
||||
/* post new irecv */
|
||||
err = MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
|
||||
datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||
datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, &new_req));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
@ -539,7 +515,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
} /* end of for each child */
|
||||
|
||||
/* upate the base request */
|
||||
base_req = new_req;
|
||||
base_req = new_req;
|
||||
/* go to the next buffer (ie. the one corresponding to the next recv) */
|
||||
tmpbuf[lr] += realsegsize[lr];
|
||||
} /* end of for segindex */
|
||||
@ -552,10 +528,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
} /* end of for each child */
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* leaf nodes */
|
||||
else {
|
||||
else {
|
||||
/* Just consume segments as fast as possible */
|
||||
sendcount[lr] = segcount[lr];
|
||||
for (segindex = 0; segindex < num_segments[lr]; segindex++) {
|
||||
@ -577,9 +553,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;
|
||||
|
||||
/* Step 2:
|
||||
Find your immediate pair (identical node in opposite subtree) and SendRecv
|
||||
Find your immediate pair (identical node in opposite subtree) and SendRecv
|
||||
data buffer with them.
|
||||
The tree building function ensures that
|
||||
The tree building function ensures that
|
||||
if (we are not root)
|
||||
if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
|
||||
if we are in the right subtree (lr == 1) our pair is (rank-1)%size
|
||||
@ -591,9 +567,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
pair = (rank+size-1)%size;
|
||||
}
|
||||
|
||||
if ( (size%2) != 0 && rank != root) {
|
||||
if ( (size%2) != 0 && rank != root) {
|
||||
|
||||
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
|
||||
err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
@ -607,28 +583,28 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
}
|
||||
}
|
||||
/* last node receives right buffer from the root */
|
||||
else if (rank == (root+size-1)%size) {
|
||||
err = MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype,
|
||||
root, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
}
|
||||
/* everyone else exchanges buffers */
|
||||
else {
|
||||
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
|
||||
err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
}
|
||||
return (MPI_SUCCESS);
|
||||
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
@ -636,8 +612,8 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
@ -655,21 +631,20 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, size, rank, err;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
ompi_request_t **preq, **reqs = data->mcct_reqs;
|
||||
mca_coll_base_comm_t *data = module->base_data;
|
||||
ompi_request_t **preq, **reqs;
|
||||
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_basic_linear rank %d root %d", rank, root));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_basic_linear rank %d root %d", rank, root));
|
||||
|
||||
/* Non-root receive the data. */
|
||||
|
||||
@ -680,8 +655,8 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||
}
|
||||
|
||||
/* Root sends data to all others. */
|
||||
|
||||
for (i = 0, preq = reqs; i < size; ++i) {
|
||||
preq = reqs = coll_base_comm_get_reqs(data, size-1);
|
||||
for (i = 0; i < size; ++i) {
|
||||
if (i == rank) {
|
||||
continue;
|
||||
}
|
||||
@ -691,6 +666,7 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||
MCA_PML_BASE_SEND_STANDARD,
|
||||
comm, preq++));
|
||||
if (MPI_SUCCESS != err) {
|
||||
ompi_coll_base_free_reqs(data->mcct_reqs, i);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
@ -710,148 +686,11 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
|
||||
|
||||
/* Free the reqs */
|
||||
|
||||
ompi_coll_tuned_free_reqs(reqs, i);
|
||||
ompi_coll_base_free_reqs(reqs, i);
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[BCAST] = coll_tuned_bcast_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_count",
|
||||
"Number of bcast algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_bcast_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_bcast_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_bcast_algorithms", bcast_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm",
|
||||
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_bcast_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_segment_size);
|
||||
|
||||
coll_tuned_bcast_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_tree_fanout);
|
||||
|
||||
coll_tuned_bcast_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_chain_fanout",
|
||||
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
|
||||
data->user_forced[BCAST].algorithm));
|
||||
|
||||
switch (data->user_forced[BCAST].algorithm) {
|
||||
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
|
||||
case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
|
||||
case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module,
|
||||
data->user_forced[BCAST].segsize,
|
||||
data->user_forced[BCAST].chain_fanout );
|
||||
case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module,
|
||||
data->user_forced[BCAST].segsize );
|
||||
case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module,
|
||||
data->user_forced[BCAST].segsize );
|
||||
case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module,
|
||||
data->user_forced[BCAST].segsize );
|
||||
case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module,
|
||||
data->user_forced[BCAST].segsize );
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
} /* switch */
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
|
||||
case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
|
||||
case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout );
|
||||
case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
|
||||
case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
|
||||
case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
|
||||
case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
} /* switch */
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
@ -3,10 +3,10 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
@ -15,9 +15,9 @@
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -33,6 +33,7 @@
|
||||
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/base.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
@ -49,10 +50,94 @@ static void coll_base_module_construct(mca_coll_base_module_t *m)
|
||||
/* zero out all functions */
|
||||
memset ((char *) m + sizeof (m->super), 0, sizeof (*m) - sizeof (m->super));
|
||||
m->coll_module_disable = NULL;
|
||||
m->base_data = NULL;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t,
|
||||
coll_base_module_construct, NULL);
|
||||
static void
|
||||
coll_base_module_destruct(mca_coll_base_module_t *module)
|
||||
{
|
||||
if (NULL != module->base_data) {
|
||||
OBJ_RELEASE(module->base_data);
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t,
|
||||
coll_base_module_construct, coll_base_module_destruct);
|
||||
|
||||
|
||||
static void
|
||||
coll_base_comm_construct(mca_coll_base_comm_t *data)
|
||||
{
|
||||
data->mcct_reqs = NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
data->cached_ntree = NULL;
|
||||
data->cached_bintree = NULL;
|
||||
data->cached_bmtree = NULL;
|
||||
data->cached_in_order_bmtree = NULL;
|
||||
data->cached_chain = NULL;
|
||||
data->cached_pipeline = NULL;
|
||||
data->cached_in_order_bintree = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
coll_base_comm_destruct(mca_coll_base_comm_t *data)
|
||||
{
|
||||
if( NULL != data->mcct_reqs ) {
|
||||
for( int i = 0; i < data->mcct_num_reqs; ++i ) {
|
||||
if( MPI_REQUEST_NULL != data->mcct_reqs[i] )
|
||||
ompi_request_free(&data->mcct_reqs[i]);
|
||||
}
|
||||
free(data->mcct_reqs);
|
||||
data->mcct_reqs = NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
}
|
||||
assert(0 == data->mcct_num_reqs);
|
||||
|
||||
/* free any cached information that has been allocated */
|
||||
if (data->cached_ntree) { /* destroy general tree if defined */
|
||||
ompi_coll_base_topo_destroy_tree (&data->cached_ntree);
|
||||
}
|
||||
if (data->cached_bintree) { /* destroy bintree if defined */
|
||||
ompi_coll_base_topo_destroy_tree (&data->cached_bintree);
|
||||
}
|
||||
if (data->cached_bmtree) { /* destroy bmtree if defined */
|
||||
ompi_coll_base_topo_destroy_tree (&data->cached_bmtree);
|
||||
}
|
||||
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
|
||||
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bmtree);
|
||||
}
|
||||
if (data->cached_chain) { /* destroy general chain if defined */
|
||||
ompi_coll_base_topo_destroy_tree (&data->cached_chain);
|
||||
}
|
||||
if (data->cached_pipeline) { /* destroy pipeline if defined */
|
||||
ompi_coll_base_topo_destroy_tree (&data->cached_pipeline);
|
||||
}
|
||||
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
|
||||
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree);
|
||||
}
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_coll_base_comm_t, opal_object_t,
|
||||
coll_base_comm_construct, coll_base_comm_destruct);
|
||||
|
||||
ompi_request_t** coll_base_comm_get_reqs(mca_coll_base_comm_t* data, int nreqs)
|
||||
{
|
||||
int startfrom = data->mcct_num_reqs;
|
||||
|
||||
if( NULL == data->mcct_reqs ) {
|
||||
assert(0 == data->mcct_num_reqs);
|
||||
data->mcct_reqs = (ompi_request_t**)malloc(sizeof(ompi_request_t*) * nreqs);
|
||||
} else if( data->mcct_num_reqs <= nreqs ) {
|
||||
data->mcct_reqs = (ompi_request_t**)realloc(data->mcct_reqs, sizeof(ompi_request_t*) * nreqs);
|
||||
}
|
||||
if( NULL != data->mcct_reqs ) {
|
||||
data->mcct_num_reqs = nreqs;
|
||||
for( int i = startfrom; i < data->mcct_num_reqs; i++ )
|
||||
data->mcct_reqs[i] = MPI_REQUEST_NULL;
|
||||
} else
|
||||
data->mcct_num_reqs = 0; /* nothing to return */
|
||||
return data->mcct_reqs;
|
||||
}
|
||||
|
||||
MCA_BASE_FRAMEWORK_DECLARE(ompi, coll, "Collectives", NULL, NULL, NULL,
|
||||
mca_coll_base_static_components, 0);
|
||||
|
355
ompi/mca/coll/base/coll_base_functions.h
Обычный файл
355
ompi/mca/coll/base/coll_base_functions.h
Обычный файл
@ -0,0 +1,355 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_COLL_BASE_EXPORT_H
|
||||
#define MCA_COLL_BASE_EXPORT_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "ompi/mca/coll/base/base.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/request/request.h"
|
||||
|
||||
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
|
||||
#include "coll_base_topo.h"
|
||||
|
||||
/* some fixed value index vars to simplify certain operations */
|
||||
typedef enum COLLTYPE {
|
||||
ALLGATHER = 0, /* 0 */
|
||||
ALLGATHERV, /* 1 */
|
||||
ALLREDUCE, /* 2 */
|
||||
ALLTOALL, /* 3 */
|
||||
ALLTOALLV, /* 4 */
|
||||
ALLTOALLW, /* 5 */
|
||||
BARRIER, /* 6 */
|
||||
BCAST, /* 7 */
|
||||
EXSCAN, /* 8 */
|
||||
GATHER, /* 9 */
|
||||
GATHERV, /* 10 */
|
||||
REDUCE, /* 11 */
|
||||
REDUCESCATTER, /* 12 */
|
||||
SCAN, /* 13 */
|
||||
SCATTER, /* 14 */
|
||||
SCATTERV, /* 15 */
|
||||
COLLCOUNT /* 16 end counter keep it as last element */
|
||||
} COLLTYPE_T;
|
||||
|
||||
/* defined arg lists to simply auto inclusion of user overriding decision functions */
|
||||
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
/* end defined arg lists to simply auto inclusion of user overriding decision functions */
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* All Gather */
|
||||
int ompi_coll_base_allgather_intra_bruck(ALLGATHER_ARGS);
|
||||
int ompi_coll_base_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
|
||||
int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
|
||||
int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
|
||||
int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
|
||||
int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
|
||||
|
||||
/* All GatherV */
|
||||
int ompi_coll_base_allgatherv_intra_bruck(ALLGATHERV_ARGS);
|
||||
int ompi_coll_base_allgatherv_intra_ring(ALLGATHERV_ARGS);
|
||||
int ompi_coll_base_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
|
||||
int ompi_coll_base_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
|
||||
int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
|
||||
|
||||
/* All Reduce */
|
||||
int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
|
||||
int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
|
||||
int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS);
|
||||
int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
|
||||
int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
|
||||
|
||||
/* AlltoAll */
|
||||
int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
|
||||
int ompi_coll_base_alltoall_intra_bruck(ALLTOALL_ARGS);
|
||||
int ompi_coll_base_alltoall_intra_basic_linear(ALLTOALL_ARGS);
|
||||
int ompi_coll_base_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
|
||||
int ompi_coll_base_alltoall_intra_two_procs(ALLTOALL_ARGS);
|
||||
int mca_coll_base_alltoall_intra_basic_inplace(void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module); /* special version for INPLACE */
|
||||
|
||||
/* AlltoAllV */
|
||||
int ompi_coll_base_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
|
||||
int ompi_coll_base_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
|
||||
int mca_coll_base_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module); /* special version for INPLACE */
|
||||
|
||||
/* AlltoAllW */
|
||||
|
||||
/* Barrier */
|
||||
int ompi_coll_base_barrier_intra_doublering(BARRIER_ARGS);
|
||||
int ompi_coll_base_barrier_intra_recursivedoubling(BARRIER_ARGS);
|
||||
int ompi_coll_base_barrier_intra_bruck(BARRIER_ARGS);
|
||||
int ompi_coll_base_barrier_intra_two_procs(BARRIER_ARGS);
|
||||
int ompi_coll_base_barrier_intra_tree(BARRIER_ARGS);
|
||||
int ompi_coll_base_barrier_intra_basic_linear(BARRIER_ARGS);
|
||||
|
||||
/* Bcast */
|
||||
int ompi_coll_base_bcast_intra_basic_linear(BCAST_ARGS);
|
||||
int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
|
||||
int ompi_coll_base_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
|
||||
int ompi_coll_base_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
|
||||
int ompi_coll_base_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
|
||||
int ompi_coll_base_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
|
||||
|
||||
/* Exscan */
|
||||
|
||||
/* Gather */
|
||||
int ompi_coll_base_gather_intra_basic_linear(GATHER_ARGS);
|
||||
int ompi_coll_base_gather_intra_binomial(GATHER_ARGS);
|
||||
int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
|
||||
|
||||
/* GatherV */
|
||||
|
||||
/* Reduce */
|
||||
int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS);
|
||||
int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
|
||||
int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
|
||||
/* Reduce_scatter */
|
||||
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
|
||||
int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
|
||||
int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
|
||||
|
||||
/* Scan */
|
||||
|
||||
/* Scatter */
|
||||
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
|
||||
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
|
||||
|
||||
/* ScatterV */
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#define COLL_BASE_UPDATE_BINTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
|
||||
if( !( (coll_comm->cached_bintree) \
|
||||
&& (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
|
||||
if( coll_comm->cached_bintree ) { /* destroy previous binomial if defined */ \
|
||||
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
|
||||
} \
|
||||
coll_comm->cached_bintree = ompi_coll_base_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
|
||||
coll_comm->cached_bintree_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_BASE_UPDATE_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
|
||||
if( !( (coll_comm->cached_bmtree) \
|
||||
&& (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
|
||||
if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
|
||||
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
|
||||
} \
|
||||
coll_comm->cached_bmtree = ompi_coll_base_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
|
||||
coll_comm->cached_bmtree_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_BASE_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
|
||||
if( !( (coll_comm->cached_in_order_bmtree) \
|
||||
&& (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \
|
||||
if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \
|
||||
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \
|
||||
} \
|
||||
coll_comm->cached_in_order_bmtree = ompi_coll_base_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \
|
||||
coll_comm->cached_in_order_bmtree_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_BASE_UPDATE_PIPELINE( OMPI_COMM, BASE_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
|
||||
if( !( (coll_comm->cached_pipeline) \
|
||||
&& (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
|
||||
if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
|
||||
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
|
||||
} \
|
||||
coll_comm->cached_pipeline = ompi_coll_base_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
|
||||
coll_comm->cached_pipeline_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_BASE_UPDATE_CHAIN( OMPI_COMM, BASE_MODULE, ROOT, FANOUT ) \
|
||||
do { \
|
||||
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
|
||||
if( !( (coll_comm->cached_chain) \
|
||||
&& (coll_comm->cached_chain_root == (ROOT)) \
|
||||
&& (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
|
||||
if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
|
||||
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_chain) ); \
|
||||
} \
|
||||
coll_comm->cached_chain = ompi_coll_base_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \
|
||||
coll_comm->cached_chain_root = (ROOT); \
|
||||
coll_comm->cached_chain_fanout = (FANOUT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_BASE_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, BASE_MODULE ) \
|
||||
do { \
|
||||
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
|
||||
if( !(coll_comm->cached_in_order_bintree) ) { \
|
||||
/* In-order binary tree topology is defined by communicator size */ \
|
||||
/* Thus, there is no need to destroy anything */ \
|
||||
coll_comm->cached_in_order_bintree = \
|
||||
ompi_coll_base_topo_build_in_order_bintree((OMPI_COMM)); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* This macro give a generic way to compute the best count of
|
||||
* the segment (i.e. the number of complete datatypes that
|
||||
* can fit in the specified SEGSIZE). Beware, when this macro
|
||||
* is called, the SEGCOUNT should be initialized to the count as
|
||||
* expected by the collective call.
|
||||
*/
|
||||
#define COLL_BASE_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \
|
||||
if( ((SEGSIZE) >= (TYPELNG)) && \
|
||||
((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \
|
||||
size_t residual; \
|
||||
(SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \
|
||||
residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \
|
||||
if( residual > ((TYPELNG) >> 1) ) \
|
||||
(SEGCOUNT)++; \
|
||||
} \
|
||||
|
||||
/**
|
||||
* This macro gives a generic wait to compute the well distributed block counts
|
||||
* when the count and number of blocks are fixed.
|
||||
* Macro returns "early-block" count, "late-block" count, and "split-index"
|
||||
* which is the block at which we switch from "early-block" count to
|
||||
* the "late-block" count.
|
||||
* count = split_index * early_block_count +
|
||||
* (block_count - split_index) * late_block_count
|
||||
* We do not perform ANY error checks - make sure that the input values
|
||||
* make sense (eg. count > num_blocks).
|
||||
*/
|
||||
#define COLL_BASE_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \
|
||||
EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
|
||||
EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \
|
||||
SPLIT_INDEX = COUNT % NUM_BLOCKS; \
|
||||
if (0 != SPLIT_INDEX) { \
|
||||
EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \
|
||||
} \
|
||||
|
||||
/*
|
||||
* Data structure for hanging data off the communicator
|
||||
* i.e. per module instance
|
||||
*/
|
||||
struct mca_coll_base_comm_t {
|
||||
opal_object_t super;
|
||||
|
||||
/* standard data for requests and PML usage */
|
||||
|
||||
/* Precreate space for requests
|
||||
* Note this does not effect basic,
|
||||
* but if in wrong context can confuse a debugger
|
||||
* this is controlled by an MCA param
|
||||
*/
|
||||
|
||||
ompi_request_t **mcct_reqs;
|
||||
int mcct_num_reqs;
|
||||
|
||||
/*
|
||||
* base topo information caching per communicator
|
||||
*
|
||||
* for each communicator we cache the topo information so we can
|
||||
* reuse without regenerating if we change the root, [or fanout]
|
||||
* then regenerate and recache this information
|
||||
*/
|
||||
|
||||
/* general tree with n fan out */
|
||||
ompi_coll_tree_t *cached_ntree;
|
||||
int cached_ntree_root;
|
||||
int cached_ntree_fanout;
|
||||
|
||||
/* binary tree */
|
||||
ompi_coll_tree_t *cached_bintree;
|
||||
int cached_bintree_root;
|
||||
|
||||
/* binomial tree */
|
||||
ompi_coll_tree_t *cached_bmtree;
|
||||
int cached_bmtree_root;
|
||||
|
||||
/* binomial tree */
|
||||
ompi_coll_tree_t *cached_in_order_bmtree;
|
||||
int cached_in_order_bmtree_root;
|
||||
|
||||
/* chained tree (fanout followed by pipelines) */
|
||||
ompi_coll_tree_t *cached_chain;
|
||||
int cached_chain_root;
|
||||
int cached_chain_fanout;
|
||||
|
||||
/* pipeline */
|
||||
ompi_coll_tree_t *cached_pipeline;
|
||||
int cached_pipeline_root;
|
||||
|
||||
/* in-order binary tree (root of the in-order binary tree is rank 0) */
|
||||
ompi_coll_tree_t *cached_in_order_bintree;
|
||||
};
|
||||
typedef struct mca_coll_base_comm_t mca_coll_base_comm_t;
|
||||
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t);
|
||||
|
||||
static inline void ompi_coll_base_free_reqs(ompi_request_t **reqs, int count)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < count; ++i)
|
||||
ompi_request_free(&reqs[i]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the array of requests on the data. If the array was not initialized
|
||||
* or if it's size was too small, allocate it to fit the requested size.
|
||||
*/
|
||||
ompi_request_t** coll_base_comm_get_reqs(mca_coll_base_comm_t* data, int nreqs);
|
||||
|
||||
#endif /* MCA_COLL_BASE_EXPORT_H */
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -30,30 +30,14 @@
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
/* gather algorithm variables */
|
||||
static int coll_tuned_gather_algorithm_count = 3;
|
||||
static int coll_tuned_gather_forced_algorithm = 0;
|
||||
static int coll_tuned_gather_segment_size = 0;
|
||||
static int coll_tuned_gather_tree_fanout;
|
||||
static int coll_tuned_gather_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_gather_forced_algorithm */
|
||||
static mca_base_var_enum_value_t gather_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "binomial"},
|
||||
{3, "linear_sync"},
|
||||
{0, NULL}
|
||||
};
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
|
||||
* gather_intra_pipeline, segmentation? */
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||
ompi_coll_base_gather_intra_binomial(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -65,19 +49,19 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||
char *ptmp = NULL, *tempbuf = NULL;
|
||||
ompi_coll_tree_t* bmtree;
|
||||
MPI_Status status;
|
||||
MPI_Aint sextent, slb, strue_lb, strue_extent;
|
||||
MPI_Aint sextent, slb, strue_lb, strue_extent;
|
||||
MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
|
||||
mca_coll_base_comm_t *data = base_module->base_data;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_binomial rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_gather_intra_binomial rank %d", rank));
|
||||
|
||||
/* create the binomial tree */
|
||||
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
|
||||
COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
|
||||
bmtree = data->cached_in_order_bmtree;
|
||||
|
||||
ompi_datatype_get_extent(sdtype, &slb, &sextent);
|
||||
@ -112,7 +96,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
} else {
|
||||
/* copy from rbuf to temp buffer */
|
||||
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
|
||||
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
|
||||
(char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
|
||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
@ -157,8 +141,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||
mycount = size - vkid;
|
||||
mycount *= rcount;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d",
|
||||
rank, bmtree->tree_next[i], mycount));
|
||||
|
||||
err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
|
||||
@ -172,8 +156,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||
|
||||
if (rank != root) {
|
||||
/* all nodes except root send to parents */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n",
|
||||
rank, bmtree->tree_prev, total_recv));
|
||||
|
||||
err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
|
||||
@ -207,7 +191,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||
if (NULL != tempbuf)
|
||||
free(tempbuf);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
|
||||
__FILE__, line, err, rank));
|
||||
return err;
|
||||
}
|
||||
@ -220,25 +204,25 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
ompi_coll_base_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int first_segment_size)
|
||||
{
|
||||
int i, ret, line, rank, size, first_segment_count;
|
||||
ompi_request_t **reqs = NULL;
|
||||
MPI_Aint extent, lb;
|
||||
size_t typelng;
|
||||
ompi_request_t **reqs = NULL;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
|
||||
|
||||
if (rank != root) {
|
||||
/* Non-root processes:
|
||||
@ -250,10 +234,10 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
ompi_datatype_type_size(sdtype, &typelng);
|
||||
ompi_datatype_get_extent(sdtype, &lb, &extent);
|
||||
first_segment_count = scount;
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
|
||||
COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
|
||||
first_segment_count );
|
||||
|
||||
ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root,
|
||||
ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root,
|
||||
MCA_COLL_BASE_TAG_GATHER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
@ -263,15 +247,15 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
|
||||
(scount - first_segment_count), sdtype,
|
||||
ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
|
||||
(scount - first_segment_count), sdtype,
|
||||
root, MCA_COLL_BASE_TAG_GATHER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
} else {
|
||||
|
||||
/* Root process,
|
||||
/* Root process,
|
||||
- For every non-root node:
|
||||
- post irecv for the first segment of the message
|
||||
- send zero byte message to signal node to send the message
|
||||
@ -284,20 +268,20 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
ompi_request_t *first_segment_req;
|
||||
reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*));
|
||||
if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
ompi_datatype_type_size(rdtype, &typelng);
|
||||
ompi_datatype_get_extent(rdtype, &lb, &extent);
|
||||
first_segment_count = rcount;
|
||||
COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
|
||||
COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
|
||||
first_segment_count );
|
||||
|
||||
ptmp = (char *) rbuf;
|
||||
for (i = 0; i < size; ++i) {
|
||||
if (i == rank) {
|
||||
if (i == rank) {
|
||||
/* skip myself */
|
||||
reqs[i] = MPI_REQUEST_NULL;
|
||||
continue;
|
||||
}
|
||||
reqs[i] = MPI_REQUEST_NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* irecv for the first segment from i */
|
||||
ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent;
|
||||
@ -305,7 +289,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
MCA_COLL_BASE_TAG_GATHER, comm,
|
||||
&first_segment_req));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* send sync message */
|
||||
ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i,
|
||||
MCA_COLL_BASE_TAG_GATHER,
|
||||
@ -314,7 +298,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
|
||||
/* irecv for the second segment */
|
||||
ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent;
|
||||
ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
|
||||
ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
|
||||
rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm,
|
||||
&reqs[i]));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
@ -327,11 +311,11 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
/* copy local data if necessary */
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
ret = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
|
||||
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
|
||||
rcount, rdtype);
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
|
||||
/* wait all second segments to complete */
|
||||
ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
@ -346,8 +330,8 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
if (NULL != reqs) {
|
||||
free(reqs);
|
||||
}
|
||||
OPAL_OUTPUT (( ompi_coll_tuned_stream,
|
||||
"ERROR_HNDL: node %d file %s line %d error %d\n",
|
||||
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
|
||||
"ERROR_HNDL: node %d file %s line %d error %d\n",
|
||||
rank, __FILE__, line, ret ));
|
||||
return ret;
|
||||
}
|
||||
@ -355,13 +339,13 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
* have to duplicate code.
|
||||
* JPG following the examples from other coll_tuned implementations. Dec06.
|
||||
* JPG following the examples from other coll_base implementations. Dec06.
|
||||
*/
|
||||
|
||||
/* copied function (with appropriate renaming) starts here */
|
||||
@ -373,7 +357,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
|
||||
ompi_coll_base_gather_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
@ -389,8 +373,8 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* Everyone but root sends data and returns. */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_basic_linear rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"ompi_coll_base_gather_intra_basic_linear rank %d", rank));
|
||||
|
||||
if (rank != root) {
|
||||
return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
|
||||
@ -427,164 +411,3 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
|
||||
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[GATHER] = coll_tuned_gather_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_count",
|
||||
"Number of gather algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_gather_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_gather_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_gather_algorithms", gather_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm",
|
||||
"Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_gather_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_segment_size);
|
||||
|
||||
coll_tuned_gather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_tree_fanout);
|
||||
|
||||
coll_tuned_gather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_chain_fanout",
|
||||
"Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[GATHER].algorithm));
|
||||
|
||||
switch (data->user_forced[GATHER].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (3):
|
||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
data->user_forced[GATHER].segsize);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[GATHER].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (3):
|
||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -32,37 +32,21 @@
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
|
||||
/* reduce_scatter algorithm variables */
|
||||
static int coll_tuned_reduce_scatter_algorithm_count = 2;
|
||||
static int coll_tuned_reduce_scatter_forced_algorithm = 0;
|
||||
static int coll_tuned_reduce_scatter_segment_size = 0;
|
||||
static int coll_tuned_reduce_scatter_tree_fanout;
|
||||
static int coll_tuned_reduce_scatter_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_reduce_scatter_forced_algorithm */
|
||||
static mca_base_var_enum_value_t reduce_scatter_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "non-overlapping"},
|
||||
{2, "recursive_halfing"},
|
||||
{3, "ring"},
|
||||
{0, NULL}
|
||||
};
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
|
||||
/*******************************************************************************
|
||||
* ompi_coll_tuned_reduce_scatter_intra_nonoverlapping
|
||||
* ompi_coll_base_reduce_scatter_intra_nonoverlapping
|
||||
*
|
||||
* This function just calls a reduce to rank 0, followed by an
|
||||
* This function just calls a reduce to rank 0, followed by an
|
||||
* appropriate scatterv call.
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int err, i, rank, size, total_count, *displs = NULL;
|
||||
const int root = 0;
|
||||
@ -71,7 +55,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_nonoverlapping, rank %d", rank));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_nonoverlapping, rank %d", rank));
|
||||
|
||||
for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; }
|
||||
|
||||
@ -80,7 +64,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
/* rbuf on root (0) is big enough to hold whole data */
|
||||
if (root == rank) {
|
||||
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
|
||||
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
|
||||
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
||||
} else {
|
||||
err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count,
|
||||
@ -91,13 +75,13 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
/* We must allocate temporary receive buffer on root to ensure that
|
||||
rbuf is big enough */
|
||||
ptrdiff_t lb, extent, tlb, textent;
|
||||
|
||||
|
||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||
ompi_datatype_get_true_extent(dtype, &tlb, &textent);
|
||||
|
||||
tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent);
|
||||
tmprbuf = tmprbuf_free - lb;
|
||||
}
|
||||
}
|
||||
err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count,
|
||||
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
||||
}
|
||||
@ -105,7 +89,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
if (NULL != tmprbuf_free) free(tmprbuf_free);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
displs = (int*) malloc(size * sizeof(int));
|
||||
displs[0] = 0;
|
||||
for (i = 1; i < size; i++) {
|
||||
@ -122,7 +106,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
|
||||
/*
|
||||
* Recursive-halving function is (*mostly*) copied from the BASIC coll module.
|
||||
* I have removed the part which handles "large" message sizes
|
||||
* I have removed the part which handles "large" message sizes
|
||||
* (non-overlapping version of reduce_Scatter).
|
||||
*/
|
||||
|
||||
@ -131,15 +115,15 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
||||
/*
|
||||
* reduce_scatter_intra_basic_recursivehalving
|
||||
*
|
||||
* Function: - reduce scatter implementation using recursive-halving
|
||||
* Function: - reduce scatter implementation using recursive-halving
|
||||
* algorithm
|
||||
* Accepts: - same as MPI_Reduce_scatter()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
* Limitation: - Works only for commutative operations.
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
void *rbuf,
|
||||
ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
void *rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
@ -151,12 +135,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
|
||||
char *recv_buf = NULL, *recv_buf_free = NULL;
|
||||
char *result_buf = NULL, *result_buf_free = NULL;
|
||||
|
||||
|
||||
/* Initialize */
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));
|
||||
|
||||
/* Find displacements and the like */
|
||||
disps = (int*) malloc(sizeof(int) * size);
|
||||
@ -191,43 +175,43 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
err = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
|
||||
/* allocate temporary buffer for results */
|
||||
result_buf_free = (char*) malloc(buf_size);
|
||||
result_buf = result_buf_free - true_lb;
|
||||
|
||||
|
||||
/* copy local buffer into the temporary results */
|
||||
err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
|
||||
if (OMPI_SUCCESS != err) goto cleanup;
|
||||
|
||||
|
||||
/* figure out power of two mapping: grow until larger than
|
||||
comm size, then go back one, to get the largest power of
|
||||
two less than comm size */
|
||||
tmp_size = opal_next_poweroftwo (size);
|
||||
tmp_size = opal_next_poweroftwo (size);
|
||||
tmp_size >>= 1;
|
||||
remain = size - tmp_size;
|
||||
|
||||
|
||||
/* If comm size is not a power of two, have the first "remain"
|
||||
procs with an even rank send to rank + 1, leaving a power of
|
||||
two procs to do the rest of the algorithm */
|
||||
if (rank < 2 * remain) {
|
||||
if ((rank & 1) == 0) {
|
||||
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
|
||||
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||
MCA_PML_BASE_SEND_STANDARD,
|
||||
comm));
|
||||
if (OMPI_SUCCESS != err) goto cleanup;
|
||||
|
||||
|
||||
/* we don't participate from here on out */
|
||||
tmp_rank = -1;
|
||||
} else {
|
||||
err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
|
||||
|
||||
/* integrate their results into our temp results */
|
||||
ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
|
||||
|
||||
|
||||
/* adjust rank to be the bottom "remain" ranks */
|
||||
tmp_rank = rank / 2;
|
||||
}
|
||||
@ -236,13 +220,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
remain" ranks dropped out */
|
||||
tmp_rank = rank - remain;
|
||||
}
|
||||
|
||||
|
||||
/* For ranks not kicked out by the above code, perform the
|
||||
recursive halving */
|
||||
if (tmp_rank >= 0) {
|
||||
int *tmp_disps = NULL, *tmp_rcounts = NULL;
|
||||
int mask, send_index, recv_index, last_index;
|
||||
|
||||
|
||||
/* recalculate disps and rcounts to account for the
|
||||
special "remainder" processes that are no longer doing
|
||||
anything */
|
||||
@ -317,11 +301,11 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
free(tmp_rcounts);
|
||||
free(tmp_disps);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (send_count > 0) {
|
||||
err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
|
||||
send_count, dtype, peer,
|
||||
send_count, dtype, peer,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||
MCA_PML_BASE_SEND_STANDARD,
|
||||
comm));
|
||||
@ -329,7 +313,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
free(tmp_rcounts);
|
||||
free(tmp_disps);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* if we received something on this step, push it into
|
||||
@ -340,10 +324,10 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
free(tmp_rcounts);
|
||||
free(tmp_disps);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
ompi_op_reduce(op,
|
||||
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
||||
ompi_op_reduce(op,
|
||||
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
||||
result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
||||
recv_count, dtype);
|
||||
}
|
||||
@ -357,13 +341,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
/* copy local results from results buffer into real receive buffer */
|
||||
if (0 != rcounts[rank]) {
|
||||
err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
|
||||
rcounts[rank], dtype,
|
||||
rcounts[rank], dtype,
|
||||
rbuf, rcounts[rank], dtype);
|
||||
if (OMPI_SUCCESS != err) {
|
||||
free(tmp_rcounts);
|
||||
free(tmp_disps);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(tmp_rcounts);
|
||||
@ -389,7 +373,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
comm));
|
||||
if (OMPI_SUCCESS != err) goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
@ -404,18 +388,18 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
|
||||
|
||||
/*
|
||||
* ompi_coll_tuned_reduce_scatter_intra_ring
|
||||
* ompi_coll_base_reduce_scatter_intra_ring
|
||||
*
|
||||
* Function: Ring algorithm for reduce_scatter operation
|
||||
* Accepts: Same as MPI_Reduce_scatter()
|
||||
* Returns: MPI_SUCCESS or error code
|
||||
*
|
||||
* Description: Implements ring algorithm for reduce_scatter:
|
||||
* the block sizes defined in rcounts are exchanged and
|
||||
* Description: Implements ring algorithm for reduce_scatter:
|
||||
* the block sizes defined in rcounts are exchanged and
|
||||
8 updated until they reach proper destination.
|
||||
* Algorithm requires 2 * max(rcounts) extra buffering
|
||||
*
|
||||
* Limitations: The algorithm DOES NOT preserve order of operations so it
|
||||
* Limitations: The algorithm DOES NOT preserve order of operations so it
|
||||
* can be used only for commutative operations.
|
||||
* Example on 5 nodes:
|
||||
* Initial state
|
||||
@ -427,7 +411,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
* [04] -> [14] [24] [34] [44]
|
||||
*
|
||||
* COMPUTATION PHASE
|
||||
* Step 0: rank r sends block (r-1) to rank (r+1) and
|
||||
* Step 0: rank r sends block (r-1) to rank (r+1) and
|
||||
* receives block (r+1) from rank (r-1) [with wraparound].
|
||||
* # 0 1 2 3 4
|
||||
* [00] [10] [10+20] -> [30] [40]
|
||||
@ -435,12 +419,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
* -> [02] [12] [22] [32] [32+42] -->..
|
||||
* [43+03] -> [13] [23] [33] [43]
|
||||
* [04] [04+14] -> [24] [34] [44]
|
||||
*
|
||||
*
|
||||
* Step 1:
|
||||
* # 0 1 2 3 4
|
||||
* [00] [10] [10+20] [10+20+30] -> [40]
|
||||
* -> [01] [11] [21] [21+31] [21+31+41] ->
|
||||
* [32+42+02] -> [12] [22] [32] [32+42]
|
||||
* [32+42+02] -> [12] [22] [32] [32+42]
|
||||
* [03] [43+03+13] -> [23] [33] [43]
|
||||
* [04] [04+14] [04+14+24] -> [34] [44]
|
||||
*
|
||||
@ -448,7 +432,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
* # 0 1 2 3 4
|
||||
* -> [00] [10] [10+20] [10+20+30] [10+20+30+40] ->
|
||||
* [21+31+41+01]-> [11] [21] [21+31] [21+31+41]
|
||||
* [32+42+02] [32+42+02+12]-> [22] [32] [32+42]
|
||||
* [32+42+02] [32+42+02+12]-> [22] [32] [32+42]
|
||||
* [03] [43+03+13] [43+03+13+23]-> [33] [43]
|
||||
* [04] [04+14] [04+14+24] [04+14+24+34] -> [44]
|
||||
*
|
||||
@ -456,14 +440,14 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
||||
* # 0 1 2 3 4
|
||||
* [10+20+30+40+00] [10] [10+20] [10+20+30] [10+20+30+40]
|
||||
* [21+31+41+01] [21+31+41+01+11] [21] [21+31] [21+31+41]
|
||||
* [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42]
|
||||
* [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42]
|
||||
* [03] [43+03+13] [43+03+13+23] [43+03+13+23+33] [43]
|
||||
* [04] [04+14] [04+14+24] [04+14+24+34] [04+14+24+34+44]
|
||||
* DONE :)
|
||||
*
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
int
|
||||
ompi_coll_base_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
@ -480,11 +464,11 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:reduce_scatter_intra_ring rank %d, size %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:reduce_scatter_intra_ring rank %d, size %d",
|
||||
rank, size));
|
||||
|
||||
/* Determine the maximum number of elements per node,
|
||||
/* Determine the maximum number of elements per node,
|
||||
corresponding block size, and displacements array.
|
||||
*/
|
||||
displs = (int*) malloc(size * sizeof(int));
|
||||
@ -492,16 +476,16 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
displs[0] = 0;
|
||||
total_count = rcounts[0];
|
||||
max_block_count = rcounts[0];
|
||||
for (i = 1; i < size; i++) {
|
||||
for (i = 1; i < size; i++) {
|
||||
displs[i] = total_count;
|
||||
total_count += rcounts[i];
|
||||
if (max_block_count < rcounts[i]) max_block_count = rcounts[i];
|
||||
}
|
||||
|
||||
|
||||
/* Special case for size == 1 */
|
||||
if (1 == size) {
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
||||
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
||||
(char*)rbuf, (char*)sbuf);
|
||||
if (ret < 0) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
@ -541,13 +525,13 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
sbuf = rbuf;
|
||||
}
|
||||
|
||||
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
||||
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
||||
accumbuf, (char*)sbuf);
|
||||
if (ret < 0) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
/* Computation loop */
|
||||
|
||||
/*
|
||||
/*
|
||||
For each of the remote nodes:
|
||||
- post irecv for block (r-2) from (r-1) with wrap around
|
||||
- send block (r-1) to (r+1)
|
||||
@ -568,7 +552,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
inbi = 0;
|
||||
/* Initialize first receive from the neighbor on the left */
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
||||
&reqs[inbi]));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
tmpsend = accumbuf + (ptrdiff_t)displs[recv_from] * extent;
|
||||
@ -579,25 +563,25 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
|
||||
for (k = 2; k < size; k++) {
|
||||
const int prevblock = (rank + size - k) % size;
|
||||
|
||||
|
||||
inbi = inbi ^ 0x1;
|
||||
|
||||
/* Post irecv for the current block */
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
||||
&reqs[inbi]));
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* Wait on previous block to arrive */
|
||||
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
|
||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
|
||||
/* Apply operation on previous block: result goes to rbuf
|
||||
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
|
||||
*/
|
||||
tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent;
|
||||
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype);
|
||||
|
||||
|
||||
/* send previous block to send_to */
|
||||
ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to,
|
||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||
@ -613,7 +597,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
rbuf[rank] = inbuf[inbi] (op) rbuf[rank] */
|
||||
tmprecv = accumbuf + (ptrdiff_t)displs[rank] * extent;
|
||||
ompi_op_reduce(op, inbuf[inbi], tmprecv, rcounts[rank], dtype);
|
||||
|
||||
|
||||
/* Copy result from tmprecv to rbuf */
|
||||
ret = ompi_datatype_copy_content_same_ddt(dtype, rcounts[rank], (char *)rbuf, tmprecv);
|
||||
if (ret < 0) { line = __LINE__; goto error_hndl; }
|
||||
@ -626,7 +610,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
return MPI_SUCCESS;
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
|
||||
__FILE__, line, rank, ret));
|
||||
if (NULL != displs) free(displs);
|
||||
if (NULL != accumbuf_free) free(accumbuf_free);
|
||||
@ -634,139 +618,3 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
||||
if (NULL != inbuf_free[1]) free(inbuf_free[1]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The following are used by dynamic and forced rules
|
||||
*
|
||||
* publish details of each algorithm and if its forced/fixed/locked in
|
||||
* as you add methods/algorithms you must update this and the query/map routines
|
||||
*
|
||||
* this routine is called by the component only
|
||||
* this makes sure that the mca parameters are set to their initial values and
|
||||
* perms module does not call this they call the forced_getvalues routine
|
||||
* instead
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER] = coll_tuned_reduce_scatter_algorithm_count;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_count",
|
||||
"Number of reduce_scatter algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&coll_tuned_reduce_scatter_algorithm_count);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_reduce_scatter_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_reduce_scatter_algorithms", reduce_scatter_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm",
|
||||
"Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_reduce_scatter_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_segment_size);
|
||||
|
||||
coll_tuned_reduce_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_tree_fanout);
|
||||
|
||||
coll_tuned_reduce_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_chain_fanout",
|
||||
"Fanout for chains used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[REDUCESCATTER].algorithm));
|
||||
|
||||
switch (data->user_forced[REDUCESCATTER].algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
||||
|
256
ompi/mca/coll/base/coll_base_scatter.c
Обычный файл
256
ompi/mca/coll/base/coll_base_scatter.c
Обычный файл
@ -0,0 +1,256 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
/*
 * Binomial-tree scatter.
 *
 * The root pushes data down an in-order binomial tree: internal (even
 * virtual-rank) nodes receive a contiguous slab covering their whole
 * subtree, keep their own chunk, and forward the remainder to each child;
 * leaf (odd virtual-rank) nodes just receive their single chunk.
 *
 * Arguments are those of MPI_Scatter; `module` supplies the cached tree.
 * Returns MPI_SUCCESS, or the first PML/datatype error encountered
 * (temporary buffers are released on the error path).
 */
int
ompi_coll_base_scatter_intra_binomial(void *sbuf, int scount,
                                      struct ompi_datatype_t *sdtype,
                                      void *rbuf, int rcount,
                                      struct ompi_datatype_t *rdtype,
                                      int root,
                                      struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module)
{
    int line = -1, i, rank, vrank, size, total_send = 0, err;
    char *ptmp, *tempbuf = NULL;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, slb, strue_lb, strue_extent;
    MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
    mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
    mca_coll_base_comm_t *data = base_module->base_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_scatter_intra_binomial rank %d", rank));

    /* Create (or reuse the cached) in-order binomial tree rooted at `root`. */
    COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
    bmtree = data->cached_in_order_bmtree;

    ompi_datatype_get_extent(sdtype, &slb, &sextent);
    ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent);
    ompi_datatype_get_extent(rdtype, &rlb, &rextent);
    ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);

    /* Virtual rank: rotate ranks so the root is virtual rank 0. */
    vrank = (rank - root + size) % size;
    ptmp = (char *) rbuf;  /* by default suppose leaf nodes, just use rbuf */

    if (rank == root) {
        if (0 == root) {
            /* root on 0, just use the send buffer */
            ptmp = (char *) sbuf;
            if (rbuf != MPI_IN_PLACE) {
                /* local copy of the root's own chunk to rbuf */
                err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
                                           rbuf, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        } else {
            /* root is not on 0, allocate temp buffer for send */
            tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
            if (NULL == tempbuf) {
                err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
            }

            /* Bias the pointer so element 0 lands at the true lower bound. */
            ptmp = tempbuf - strue_lb;

            /* Rotate the data so chunk `root` sits at offset 0 of tempbuf:
             * first copy chunks [root .. size-1] to the front ... */
            err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root),
                                                      ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            /* ... then chunks [0 .. root-1] behind them. */
            err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root,
                                                      ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            if (rbuf != MPI_IN_PLACE) {
                /* local copy of the root's own chunk to rbuf */
                err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
                                           rbuf, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        }
        total_send = scount;
    } else if (!(vrank % 2)) {
        /* non-root, non-leaf nodes: allocate temp buffer for recv
         * (the most we need is rcount*size/2, but we size for the full
         * slab; NOTE(review): allocation uses rcount*size — confirm the
         * /2 bound was intentionally not applied) */
        tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
        if (NULL == tempbuf) {
            err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }

        /* Bias the pointer so element 0 lands at the true lower bound. */
        ptmp = tempbuf - rtrue_lb;

        /* Internal non-root nodes forward in terms of the receive type. */
        sdtype = rdtype;
        scount = rcount;
        sextent = rextent;
        total_send = scount;
    }

    if (!(vrank % 2)) {
        /* Internal node (even virtual rank, root included). */
        if (rank != root) {
            /* recv the slab for my whole subtree from my parent */
            err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev,
                                    MCA_COLL_BASE_TAG_SCATTER, comm, &status));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            /* local copy of my own chunk to rbuf */
            err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
                                       rbuf, rcount, rdtype);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
        /* send to children on all non-leaf */
        for (i = 0; i < bmtree->tree_nextsize; i++) {
            size_t mycount = 0;
            int vkid;
            /* figure out how much data I have to send to this child:
             * everything between the child's virtual rank and either my
             * next child's subtree or the end of the rank space */
            vkid = (bmtree->tree_next[i] - root + size) % size;
            mycount = vkid - vrank;
            if( (int)mycount > (size - vkid) )
                mycount = size - vkid;
            mycount *= scount;

            /* `total_send` tracks how far into my slab I have consumed. */
            err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype,
                                    bmtree->tree_next[i],
                                    MCA_COLL_BASE_TAG_SCATTER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            total_send += mycount;
        }

        if (NULL != tempbuf)
            free(tempbuf);
    } else {
        /* Leaf node (odd virtual rank): receive only my own chunk. */
        err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev,
                                MCA_COLL_BASE_TAG_SCATTER, comm, &status));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    /* Release the temp buffer (free(NULL) guard kept for clarity) and log
     * the failing source line before propagating the error. */
    if (NULL != tempbuf)
        free(tempbuf);

    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
|
||||
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as base/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
* have to duplicate code.
|
||||
* JPG following the examples from other coll_base implementations. Dec06.
|
||||
*/
|
||||
|
||||
/* copied function (with appropriate renaming) starts here */
|
||||
/*
|
||||
* scatter_intra
|
||||
*
|
||||
* Function: - basic scatter operation
|
||||
* Accepts: - same arguments as MPI_Scatter()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_base_scatter_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, rank, size, err;
|
||||
ptrdiff_t lb, incr;
|
||||
char *ptmp;
|
||||
|
||||
/* Initialize */
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
|
||||
/* If not root, receive data. */
|
||||
|
||||
if (rank != root) {
|
||||
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
|
||||
MCA_COLL_BASE_TAG_SCATTER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
return err;
|
||||
}
|
||||
|
||||
/* I am the root, loop sending data. */
|
||||
|
||||
err = ompi_datatype_get_extent(sdtype, &lb, &incr);
|
||||
if (OMPI_SUCCESS != err) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
incr *= scount;
|
||||
for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
|
||||
|
||||
/* simple optimization */
|
||||
|
||||
if (i == rank) {
|
||||
if (MPI_IN_PLACE != rbuf) {
|
||||
err =
|
||||
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
|
||||
rdtype);
|
||||
}
|
||||
} else {
|
||||
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
|
||||
MCA_COLL_BASE_TAG_SCATTER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
}
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
@ -2,19 +2,19 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -25,8 +25,8 @@
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "coll_base_topo.h"
|
||||
|
||||
/*
|
||||
* Some static helpers.
|
||||
@ -75,36 +75,36 @@ static int calculate_num_nodes_up_to_level( int fanout, int level )
|
||||
*/
|
||||
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
ompi_coll_base_topo_build_tree( int fanout,
|
||||
struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
{
|
||||
int rank, size, schild, sparent, shiftedrank, i;
|
||||
int level; /* location of my rank in the tree structure of size */
|
||||
int delta; /* number of nodes on my level */
|
||||
int slimit; /* total number of nodes on levels above me */
|
||||
int slimit; /* total number of nodes on levels above me */
|
||||
ompi_coll_tree_t* tree;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree Building fo %d rt %d", fanout, root));
|
||||
|
||||
if (fanout<1) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree invalid fanout %d", fanout));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree invalid fanout %d", fanout));
|
||||
return NULL;
|
||||
}
|
||||
if (fanout>MAXTREEFANOUT) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
*/
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
|
||||
if (!tree) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree PANIC::out of memory"));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree PANIC::out of memory"));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -115,8 +115,8 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
* Set root
|
||||
*/
|
||||
tree->tree_root = root;
|
||||
|
||||
/*
|
||||
|
||||
/*
|
||||
* Initialize tree
|
||||
*/
|
||||
tree->tree_fanout = fanout;
|
||||
@ -132,11 +132,11 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
if( size < 2 ) {
|
||||
return tree;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Shift all ranks by root, so that the algorithm can be
|
||||
* Shift all ranks by root, so that the algorithm can be
|
||||
* designed as if root would be always 0
|
||||
* shiftedrank should be used in calculating distances
|
||||
* shiftedrank should be used in calculating distances
|
||||
* and position in tree
|
||||
*/
|
||||
shiftedrank = rank - root;
|
||||
@ -158,7 +158,7 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* find my parent */
|
||||
slimit = calculate_num_nodes_up_to_level( fanout, level );
|
||||
sparent = shiftedrank;
|
||||
@ -170,12 +170,12 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
}
|
||||
}
|
||||
tree->tree_prev = (sparent+root)%size;
|
||||
|
||||
|
||||
return tree;
|
||||
}
|
||||
|
||||
/*
|
||||
* Constructs in-order binary tree which can be used for non-commutative reduce
|
||||
* Constructs in-order binary tree which can be used for non-commutative reduce
|
||||
* operations.
|
||||
* Root of this tree is always rank (size-1) and fanout is 2.
|
||||
* Here are some of the examples of this tree:
|
||||
@ -189,28 +189,28 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
* 4 0
|
||||
*/
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||
ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||
{
|
||||
int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
|
||||
ompi_coll_tree_t* tree;
|
||||
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
*/
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
|
||||
if (!tree) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:topo_build_tree PANIC::out of memory"));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:topo_build_tree PANIC::out of memory"));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
tree->tree_root = MPI_UNDEFINED;
|
||||
tree->tree_nextsize = MPI_UNDEFINED;
|
||||
|
||||
/*
|
||||
/*
|
||||
* Initialize tree
|
||||
*/
|
||||
tree->tree_fanout = 2;
|
||||
@ -220,11 +220,11 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||
tree->tree_nextsize = 0;
|
||||
tree->tree_next[0] = -1;
|
||||
tree->tree_next[1] = -1;
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:topo_build_in_order_tree Building fo %d rt %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:topo_build_in_order_tree Building fo %d rt %d",
|
||||
tree->tree_fanout, tree->tree_root));
|
||||
|
||||
/*
|
||||
/*
|
||||
* Build the tree
|
||||
*/
|
||||
myrank = rank;
|
||||
@ -240,18 +240,18 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||
rchild = -1;
|
||||
if (size - 1 > 0) {
|
||||
lchild = parent - 1;
|
||||
if (lchild > 0) {
|
||||
if (lchild > 0) {
|
||||
rchild = rightsize - 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* The following cases are possible: myrank can be
|
||||
|
||||
/* The following cases are possible: myrank can be
|
||||
- a parent,
|
||||
- belong to the left subtree, or
|
||||
- belong to the right subtee
|
||||
Each of the cases need to be handled differently.
|
||||
*/
|
||||
|
||||
|
||||
if (myrank == parent) {
|
||||
/* I am the parent:
|
||||
- compute real ranks of my children, and exit the loop. */
|
||||
@ -262,7 +262,7 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||
if (myrank > rchild) {
|
||||
/* I belong to the left subtree:
|
||||
- If I am the left child, compute real rank of my parent
|
||||
- Iterate down through tree:
|
||||
- Iterate down through tree:
|
||||
compute new size, shift ranks down, and update delta.
|
||||
*/
|
||||
if (myrank == lchild) {
|
||||
@ -276,8 +276,8 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||
} else {
|
||||
/* I belong to the right subtree:
|
||||
- If I am the right child, compute real rank of my parent
|
||||
- Iterate down through tree:
|
||||
compute new size and parent,
|
||||
- Iterate down through tree:
|
||||
compute new size and parent,
|
||||
but the delta and rank do not need to change.
|
||||
*/
|
||||
if (myrank == rchild) {
|
||||
@ -287,14 +287,14 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||
parent = rchild;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
|
||||
if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
|
||||
|
||||
return tree;
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
|
||||
int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree )
|
||||
{
|
||||
ompi_coll_tree_t *ptr;
|
||||
|
||||
@ -311,7 +311,7 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
|
||||
}
|
||||
|
||||
/*
|
||||
*
|
||||
*
|
||||
* Here are some of the examples of this tree:
|
||||
* size == 2 size = 4 size = 8
|
||||
* 0 0 0
|
||||
@ -323,16 +323,16 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
|
||||
* 7
|
||||
*/
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
{
|
||||
int childs = 0, rank, size, mask = 1, index, remote, i;
|
||||
ompi_coll_tree_t *bmtree;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree rt %d", root));
|
||||
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
*/
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
@ -341,7 +341,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
|
||||
bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
|
||||
if (!bmtree) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory"));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -372,7 +372,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
remote += root;
|
||||
if( remote >= size ) remote -= size;
|
||||
if (childs==MAXTREEFANOUT) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
|
||||
free(bmtree);
|
||||
return NULL;
|
||||
}
|
||||
@ -388,7 +388,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
/*
|
||||
* Constructs in-order binomial tree which can be used for gather/scatter
|
||||
* operations.
|
||||
*
|
||||
*
|
||||
* Here are some of the examples of this tree:
|
||||
* size == 2 size = 4 size = 8
|
||||
* 0 0 0
|
||||
@ -400,16 +400,16 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
* 7
|
||||
*/
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||
ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
{
|
||||
int childs = 0, rank, vrank, size, mask = 1, remote, i;
|
||||
ompi_coll_tree_t *bmtree;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_in_order_bmtree rt %d", root));
|
||||
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
*/
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
@ -418,7 +418,7 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||
|
||||
bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
|
||||
if (!bmtree) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory"));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -442,10 +442,10 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||
bmtree->tree_next[childs] = (remote + root) % size;
|
||||
childs++;
|
||||
if (childs==MAXTREEFANOUT) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||
"coll:base:topo:build_bmtree max fanout incorrect %d needed %d",
|
||||
MAXTREEFANOUT, childs));
|
||||
free (bmtree);
|
||||
free(bmtree);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -459,36 +459,36 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||
|
||||
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_chain( int fanout,
|
||||
ompi_coll_base_topo_build_chain( int fanout,
|
||||
struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
{
|
||||
int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
|
||||
ompi_coll_tree_t *chain;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain fo %d rt %d", fanout, root));
|
||||
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
/*
|
||||
* Get size and rank of the process in this communicator
|
||||
*/
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
if( fanout < 1 ) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
|
||||
fanout = 1;
|
||||
}
|
||||
if (fanout>MAXTREEFANOUT) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
|
||||
fanout = MAXTREEFANOUT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate space for topology arrays if needed
|
||||
* Allocate space for topology arrays if needed
|
||||
*/
|
||||
chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) );
|
||||
if (!chain) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain PANIC out of memory"));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain PANIC out of memory"));
|
||||
fflush(stdout);
|
||||
return NULL;
|
||||
}
|
||||
@ -496,17 +496,17 @@ ompi_coll_tuned_topo_build_chain( int fanout,
|
||||
chain->tree_nextsize = -1;
|
||||
for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
|
||||
|
||||
/*
|
||||
/*
|
||||
* Set root & numchain
|
||||
*/
|
||||
chain->tree_root = root;
|
||||
if( (size - 1) < fanout ) {
|
||||
if( (size - 1) < fanout ) {
|
||||
chain->tree_nextsize = size-1;
|
||||
fanout = size-1;
|
||||
} else {
|
||||
chain->tree_nextsize = fanout;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Shift ranks
|
||||
*/
|
||||
@ -577,7 +577,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
|
||||
chain->tree_nextsize = 1;
|
||||
} else {
|
||||
chain->tree_next[0] = -1;
|
||||
chain->tree_nextsize = 0;
|
||||
chain->tree_nextsize = 0;
|
||||
}
|
||||
}
|
||||
chain->tree_prev = (chain->tree_prev+root)%size;
|
||||
@ -586,7 +586,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Unshift values
|
||||
* Unshift values
|
||||
*/
|
||||
chain->tree_prev = -1;
|
||||
chain->tree_next[0] = (root+1)%size;
|
||||
@ -603,17 +603,18 @@ ompi_coll_tuned_topo_build_chain( int fanout,
|
||||
return chain;
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
|
||||
int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
|
||||
{
|
||||
int i;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo:topo_dump_tree %1d tree root %d"
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo:topo_dump_tree %1d tree root %d"
|
||||
" fanout %d BM %1d nextsize %d prev %d",
|
||||
rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
|
||||
tree->tree_nextsize, tree->tree_prev));
|
||||
if( tree->tree_nextsize ) {
|
||||
for( i = 0; i < tree->tree_nextsize; i++ )
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d", i, tree->tree_next[i]));
|
||||
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"[%1d] %d", i, tree->tree_next[i]));
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
@ -2,22 +2,22 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
|
||||
#define MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
|
||||
#ifndef MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED
|
||||
#define MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
@ -35,29 +35,28 @@ typedef struct ompi_coll_tree_t {
|
||||
} ompi_coll_tree_t;
|
||||
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
ompi_coll_base_topo_build_tree( int fanout,
|
||||
struct ompi_communicator_t* com,
|
||||
int root );
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm );
|
||||
ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm );
|
||||
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
int root );
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||
ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||
int root );
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_chain( int fanout,
|
||||
ompi_coll_base_topo_build_chain( int fanout,
|
||||
struct ompi_communicator_t* com,
|
||||
int root );
|
||||
|
||||
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree );
|
||||
int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree );
|
||||
|
||||
/* debugging stuff, will be removed later */
|
||||
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
|
||||
int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED */
|
||||
|
||||
#endif /* MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED */
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -19,17 +19,17 @@
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned_util.h"
|
||||
#include "coll_base_util.h"
|
||||
|
||||
int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
|
||||
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
|
||||
ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, size_t rcount,
|
||||
@ -91,14 +91,14 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
|
||||
*status = statuses[err_index];
|
||||
}
|
||||
err = statuses[err_index].MPI_ERROR;
|
||||
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s"
|
||||
" stage of ompi_coll_tuned_sendrecv_zero\n",
|
||||
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
|
||||
" stage of ompi_coll_base_sendrecv_zero\n",
|
||||
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
|
||||
} else {
|
||||
/* Error discovered during the posting of the irecv or isend,
|
||||
* and no status is available.
|
||||
*/
|
||||
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",
|
||||
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
|
||||
__FILE__, line, err));
|
||||
if (MPI_STATUS_IGNORE != status) {
|
||||
status->MPI_ERROR = err;
|
@ -2,24 +2,24 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef MCA_COLL_TUNED_UTIL_EXPORT_H
|
||||
#define MCA_COLL_TUNED_UTIL_EXPORT_H
|
||||
#ifndef MCA_COLL_BASE_UTIL_EXPORT_H
|
||||
#define MCA_COLL_BASE_UTIL_EXPORT_H
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
@ -36,10 +36,10 @@ BEGIN_C_DECLS
|
||||
* If one of the communications results in a zero-byte message the
|
||||
* communication is ignored, and no message will cross to the peer.
|
||||
*/
|
||||
int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
|
||||
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
|
||||
ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, size_t rcount,
|
||||
void* recvbuf, size_t rcount,
|
||||
ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
@ -53,24 +53,22 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
|
||||
* communications.
|
||||
*/
|
||||
static inline int
|
||||
ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype,
|
||||
ompi_coll_base_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, size_t rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status, int myid )
|
||||
{
|
||||
if ((dest == source) && (source == myid)) {
|
||||
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
|
||||
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
|
||||
recvbuf, (int32_t) rcount, rdatatype);
|
||||
}
|
||||
return ompi_coll_tuned_sendrecv_nonzero_actual (sendbuf, scount, sdatatype,
|
||||
dest, stag,
|
||||
return ompi_coll_base_sendrecv_nonzero_actual (sendbuf, scount, sdatatype,
|
||||
dest, stag,
|
||||
recvbuf, rcount, rdatatype,
|
||||
source, rtag, comm, status);
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
#endif /* MCA_COLL_TUNED_UTIL_EXPORT_H */
|
||||
|
||||
|
||||
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -31,6 +31,7 @@
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
@ -52,12 +53,6 @@ BEGIN_C_DECLS
|
||||
int mca_coll_basic_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm);
|
||||
|
||||
int mca_coll_basic_allgather_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
int mca_coll_basic_allgather_inter(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
@ -65,13 +60,6 @@ BEGIN_C_DECLS
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_allgatherv_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *disps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
int mca_coll_basic_allgatherv_inter(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
@ -91,12 +79,6 @@ BEGIN_C_DECLS
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_alltoall_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
int mca_coll_basic_alltoall_inter(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
@ -104,14 +86,6 @@ BEGIN_C_DECLS
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_alltoallv_intra(void *sbuf, int *scounts,
|
||||
int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
int mca_coll_basic_alltoallv_inter(void *sbuf, int *scounts,
|
||||
int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
@ -138,21 +112,12 @@ BEGIN_C_DECLS
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_barrier_intra_lin(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_barrier_inter_lin(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_barrier_intra_log(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_bcast_lin_intra(void *buff, int count,
|
||||
struct ompi_datatype_t *datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_bcast_lin_inter(void *buff, int count,
|
||||
struct ompi_datatype_t *datatype,
|
||||
int root,
|
||||
@ -183,13 +148,6 @@ BEGIN_C_DECLS
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_gather_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
int mca_coll_basic_gather_inter(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
@ -214,12 +172,6 @@ BEGIN_C_DECLS
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
int mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
@ -279,13 +231,6 @@ BEGIN_C_DECLS
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
|
||||
int mca_coll_basic_scatter_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module);
|
||||
int mca_coll_basic_scatter_inter(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -32,50 +32,6 @@
|
||||
#include "coll_basic.h"
|
||||
|
||||
|
||||
/*
|
||||
* allgather_intra
|
||||
*
|
||||
* Function: - allgather using other MPI collections
|
||||
* Accepts: - same as MPI_Allgather()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_allgather_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype, void *rbuf,
|
||||
int rcount, struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int err;
|
||||
ptrdiff_t lb, extent;
|
||||
|
||||
/* Handle MPI_IN_PLACE (see explanantion in reduce.c for how to
|
||||
allocate temp buffer) -- note that rank 0 can use IN_PLACE
|
||||
natively, and we can just alias the right position in rbuf
|
||||
as sbuf and avoid using a temporary buffer if gather is
|
||||
implemented correctly */
|
||||
if (MPI_IN_PLACE == sbuf && 0 != ompi_comm_rank(comm)) {
|
||||
ompi_datatype_get_extent(rdtype, &lb, &extent);
|
||||
sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount);
|
||||
sdtype = rdtype;
|
||||
scount = rcount;
|
||||
}
|
||||
|
||||
/* Gather and broadcast. */
|
||||
|
||||
err = comm->c_coll.coll_gather(sbuf, scount, sdtype, rbuf, rcount,
|
||||
rdtype, 0, comm, comm->c_coll.coll_gather_module);
|
||||
if (MPI_SUCCESS == err) {
|
||||
err = comm->c_coll.coll_bcast(rbuf, rcount * ompi_comm_size(comm),
|
||||
rdtype, 0, comm, comm->c_coll.coll_bcast_module);
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* allgather_inter
|
||||
*
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -28,87 +28,6 @@
|
||||
#include "coll_basic.h"
|
||||
|
||||
|
||||
/*
|
||||
* allgatherv_intra
|
||||
*
|
||||
* Function: - allgatherv using other MPI collectives
|
||||
* Accepts: - same as MPI_Allgatherv()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_allgatherv_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts, int *disps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, size, rank ;
|
||||
int err;
|
||||
MPI_Aint extent;
|
||||
MPI_Aint lb;
|
||||
char *send_buf = NULL;
|
||||
struct ompi_datatype_t *newtype, *send_type;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
/*
|
||||
* We don't have a root process defined. Arbitrarily assign root
|
||||
* to process with rank 0 (OMPI convention)
|
||||
*/
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
ompi_datatype_get_extent(rdtype, &lb, &extent);
|
||||
send_type = rdtype;
|
||||
send_buf = (char*)rbuf;
|
||||
for (i = 0; i < rank; ++i) {
|
||||
send_buf += (rcounts[i] * extent);
|
||||
}
|
||||
} else {
|
||||
send_buf = (char*)sbuf;
|
||||
send_type = sdtype;
|
||||
}
|
||||
|
||||
err = comm->c_coll.coll_gatherv(send_buf,
|
||||
rcounts[rank], send_type,rbuf,
|
||||
rcounts, disps, rdtype, 0,
|
||||
comm, comm->c_coll.coll_gatherv_module);
|
||||
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
/*
|
||||
* we now have all the data in the root's rbuf. Need to
|
||||
* broadcast the data out to the other processes
|
||||
*
|
||||
* Need to define a datatype that captures the different vectors
|
||||
* from each process. MPI_TYPE_INDEXED with params
|
||||
* size,rcount,displs,rdtype,newtype
|
||||
* should do the trick.
|
||||
* Use underlying ddt functions to create, and commit the
|
||||
* new datatype on each process, then broadcast and destroy the
|
||||
* datatype.
|
||||
*/
|
||||
|
||||
err = ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&newtype);
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
err = ompi_datatype_commit(&newtype);
|
||||
if(MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
err = comm->c_coll.coll_bcast( rbuf, 1 ,newtype,0,comm,
|
||||
comm->c_coll.coll_bcast_module);
|
||||
|
||||
ompi_datatype_destroy (&newtype);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* allgatherv_inter
|
||||
*
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -32,224 +32,6 @@
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
|
||||
|
||||
static int
|
||||
mca_coll_basic_alltoall_intra_inplace(void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
|
||||
int i, j, size, rank, err=MPI_SUCCESS;
|
||||
MPI_Request *preq;
|
||||
char *tmp_buffer;
|
||||
size_t max_size;
|
||||
ptrdiff_t ext;
|
||||
|
||||
/* Initialize. */
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* If only one process, we're done. */
|
||||
if (1 == size) {
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Find the largest receive amount */
|
||||
ompi_datatype_type_extent (rdtype, &ext);
|
||||
max_size = ext * rcount;
|
||||
|
||||
/* Allocate a temporary buffer */
|
||||
tmp_buffer = calloc (max_size, 1);
|
||||
if (NULL == tmp_buffer) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* in-place alltoall slow algorithm (but works) */
|
||||
for (i = 0 ; i < size ; ++i) {
|
||||
for (j = i+1 ; j < size ; ++j) {
|
||||
/* Initiate all send/recv to/from others. */
|
||||
preq = basic_module->mccb_reqs;
|
||||
|
||||
if (i == rank) {
|
||||
/* Copy the data into the temporary buffer */
|
||||
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
|
||||
(char *) rbuf + j * max_size);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Exchange data with the peer */
|
||||
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
|
||||
j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
|
||||
j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
|
||||
comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
} else if (j == rank) {
|
||||
/* Copy the data into the temporary buffer */
|
||||
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
|
||||
(char *) rbuf + i * max_size);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Exchange data with the peer */
|
||||
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
|
||||
i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
|
||||
i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
|
||||
comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Wait for the requests to complete */
|
||||
err = ompi_request_wait_all (2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Free the requests. */
|
||||
mca_coll_basic_free_reqs(basic_module->mccb_reqs, 2);
|
||||
}
|
||||
}
|
||||
|
||||
error_hndl:
|
||||
/* Free the temporary buffer */
|
||||
free (tmp_buffer);
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* alltoall_intra
|
||||
*
|
||||
* Function: - MPI_Alltoall
|
||||
* Accepts: - same as MPI_Alltoall()
|
||||
* Returns: - MPI_SUCCESS or an MPI error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_alltoall_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i;
|
||||
int rank;
|
||||
int size;
|
||||
int err;
|
||||
int nreqs;
|
||||
char *psnd;
|
||||
char *prcv;
|
||||
MPI_Aint lb;
|
||||
MPI_Aint sndinc;
|
||||
MPI_Aint rcvinc;
|
||||
|
||||
ompi_request_t **req;
|
||||
ompi_request_t **sreq;
|
||||
ompi_request_t **rreq;
|
||||
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
|
||||
|
||||
/* Initialize. */
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_basic_alltoall_intra_inplace (rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
err = ompi_datatype_get_extent(sdtype, &lb, &sndinc);
|
||||
if (OMPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
sndinc *= scount;
|
||||
|
||||
err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc);
|
||||
if (OMPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
rcvinc *= rcount;
|
||||
|
||||
/* simple optimization */
|
||||
|
||||
psnd = ((char *) sbuf) + (rank * sndinc);
|
||||
prcv = ((char *) rbuf) + (rank * rcvinc);
|
||||
|
||||
err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype);
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
/* If only one process, we're done. */
|
||||
|
||||
if (1 == size) {
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Initiate all send/recv to/from others. */
|
||||
|
||||
req = rreq = basic_module->mccb_reqs;
|
||||
sreq = rreq + size - 1;
|
||||
|
||||
prcv = (char *) rbuf;
|
||||
psnd = (char *) sbuf;
|
||||
|
||||
/* Post all receives first -- a simple optimization */
|
||||
|
||||
for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq, ++nreqs) {
|
||||
err =
|
||||
MCA_PML_CALL(irecv_init
|
||||
(prcv + (i * rcvinc), rcount, rdtype, i,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
|
||||
if (MPI_SUCCESS != err) {
|
||||
mca_coll_basic_free_reqs(req, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now post all sends */
|
||||
|
||||
for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++sreq, ++nreqs) {
|
||||
err =
|
||||
MCA_PML_CALL(isend_init
|
||||
(psnd + (i * sndinc), scount, sdtype, i,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
|
||||
if (MPI_SUCCESS != err) {
|
||||
mca_coll_basic_free_reqs(req, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
nreqs = (size - 1) * 2;
|
||||
/* Start your engines. This will never return an error. */
|
||||
|
||||
MCA_PML_CALL(start(nreqs, req));
|
||||
|
||||
/* Wait for them all. If there's an error, note that we don't
|
||||
* care what the error was -- just that there *was* an error. The
|
||||
* PML will finish all requests, even if one or more of them fail.
|
||||
* i.e., by the end of this call, all the requests are free-able.
|
||||
* So free them anyway -- even if there was an error, and return
|
||||
* the error after we free everything. */
|
||||
|
||||
err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);
|
||||
|
||||
/* Free the reqs */
|
||||
|
||||
mca_coll_basic_free_reqs(req, nreqs);
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* alltoall_inter
|
||||
*
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -33,226 +33,6 @@
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
|
||||
|
||||
static int
|
||||
mca_coll_basic_alltoallv_intra_inplace(void *rbuf, const int *rcounts, const int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
|
||||
int i, j, size, rank, err=MPI_SUCCESS;
|
||||
MPI_Request *preq;
|
||||
char *tmp_buffer;
|
||||
size_t max_size;
|
||||
ptrdiff_t ext;
|
||||
|
||||
/* Initialize. */
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* If only one process, we're done. */
|
||||
if (1 == size) {
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Find the largest receive amount */
|
||||
ompi_datatype_type_extent (rdtype, &ext);
|
||||
for (i = 0, max_size = 0 ; i < size ; ++i) {
|
||||
size_t size = ext * rcounts[i];
|
||||
|
||||
max_size = size > max_size ? size : max_size;
|
||||
}
|
||||
|
||||
/* Allocate a temporary buffer */
|
||||
tmp_buffer = calloc (max_size, 1);
|
||||
if (NULL == tmp_buffer) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* in-place alltoallv slow algorithm (but works) */
|
||||
for (i = 0 ; i < size ; ++i) {
|
||||
for (j = i+1 ; j < size ; ++j) {
|
||||
/* Initiate all send/recv to/from others. */
|
||||
preq = basic_module->mccb_reqs;
|
||||
|
||||
if (i == rank && rcounts[j]) {
|
||||
/* Copy the data into the temporary buffer */
|
||||
err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j],
|
||||
tmp_buffer, (char *) rbuf + rdisps[j] * ext);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Exchange data with the peer */
|
||||
err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype,
|
||||
j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype,
|
||||
j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
|
||||
comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
} else if (j == rank && rcounts[i]) {
|
||||
/* Copy the data into the temporary buffer */
|
||||
err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i],
|
||||
tmp_buffer, (char *) rbuf + rdisps[i] * ext);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Exchange data with the peer */
|
||||
err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype,
|
||||
i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype,
|
||||
i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
|
||||
comm, preq++));
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Wait for the requests to complete */
|
||||
err = ompi_request_wait_all (2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
|
||||
if (MPI_SUCCESS != err) { goto error_hndl; }
|
||||
|
||||
/* Free the requests. */
|
||||
mca_coll_basic_free_reqs(basic_module->mccb_reqs, 2);
|
||||
}
|
||||
}
|
||||
|
||||
error_hndl:
|
||||
/* Free the temporary buffer */
|
||||
free (tmp_buffer);
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* alltoallv_intra
|
||||
*
|
||||
* Function: - MPI_Alltoallv
|
||||
* Accepts: - same as MPI_Alltoallv()
|
||||
* Returns: - MPI_SUCCESS or an MPI error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_alltoallv_intra(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i;
|
||||
int size;
|
||||
int rank;
|
||||
int err;
|
||||
char *psnd;
|
||||
char *prcv;
|
||||
int nreqs;
|
||||
MPI_Aint sndextent;
|
||||
MPI_Aint rcvextent;
|
||||
MPI_Request *preq;
|
||||
|
||||
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
|
||||
|
||||
/* Initialize. */
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
return mca_coll_basic_alltoallv_intra_inplace (rbuf, rcounts, rdisps,
|
||||
rdtype, comm, module);
|
||||
}
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
ompi_datatype_type_extent(sdtype, &sndextent);
|
||||
ompi_datatype_type_extent(rdtype, &rcvextent);
|
||||
|
||||
/* simple optimization */
|
||||
|
||||
psnd = ((char *) sbuf) + (sdisps[rank] * sndextent);
|
||||
prcv = ((char *) rbuf) + (rdisps[rank] * rcvextent);
|
||||
|
||||
if (0 != scounts[rank]) {
|
||||
err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype,
|
||||
prcv, rcounts[rank], rdtype);
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* If only one process, we're done. */
|
||||
|
||||
if (1 == size) {
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* Initiate all send/recv to/from others. */
|
||||
|
||||
nreqs = 0;
|
||||
preq = basic_module->mccb_reqs;
|
||||
|
||||
/* Post all receives first -- a simple optimization */
|
||||
|
||||
for (i = 0; i < size; ++i) {
|
||||
if (i == rank || 0 == rcounts[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
prcv = ((char *) rbuf) + (rdisps[i] * rcvextent);
|
||||
err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype,
|
||||
i, MCA_COLL_BASE_TAG_ALLTOALLV, comm,
|
||||
preq++));
|
||||
++nreqs;
|
||||
if (MPI_SUCCESS != err) {
|
||||
mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now post all sends */
|
||||
|
||||
for (i = 0; i < size; ++i) {
|
||||
if (i == rank || 0 == scounts[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
psnd = ((char *) sbuf) + (sdisps[i] * sndextent);
|
||||
err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype,
|
||||
i, MCA_COLL_BASE_TAG_ALLTOALLV,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||
preq++));
|
||||
++nreqs;
|
||||
if (MPI_SUCCESS != err) {
|
||||
mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Start your engines. This will never return an error. */
|
||||
|
||||
MCA_PML_CALL(start(nreqs, basic_module->mccb_reqs));
|
||||
|
||||
/* Wait for them all. If there's an error, note that we don't care
|
||||
* what the error was -- just that there *was* an error. The PML
|
||||
* will finish all requests, even if one or more of them fail.
|
||||
* i.e., by the end of this call, all the requests are free-able.
|
||||
* So free them anyway -- even if there was an error, and return the
|
||||
* error after we free everything. */
|
||||
|
||||
err = ompi_request_wait_all(nreqs, basic_module->mccb_reqs,
|
||||
MPI_STATUSES_IGNORE);
|
||||
|
||||
/* Free the requests. */
|
||||
|
||||
mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs);
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* alltoallv_inter
|
||||
*
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -30,72 +30,6 @@
|
||||
#include "coll_basic.h"
|
||||
|
||||
|
||||
/*
|
||||
* barrier_intra_lin
|
||||
*
|
||||
* Function: - barrier using O(N) algorithm
|
||||
* Accepts: - same as MPI_Barrier()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_barrier_intra_lin(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i;
|
||||
int err;
|
||||
int size = ompi_comm_size(comm);
|
||||
int rank = ompi_comm_rank(comm);
|
||||
|
||||
/* All non-root send & receive zero-length message. */
|
||||
|
||||
if (rank > 0) {
|
||||
err =
|
||||
MCA_PML_CALL(send
|
||||
(NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
err =
|
||||
MCA_PML_CALL(recv
|
||||
(NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* The root collects and broadcasts the messages. */
|
||||
|
||||
else {
|
||||
for (i = 1; i < size; ++i) {
|
||||
err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 1; i < size; ++i) {
|
||||
err =
|
||||
MCA_PML_CALL(send
|
||||
(NULL, 0, MPI_BYTE, i,
|
||||
MCA_COLL_BASE_TAG_BARRIER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* barrier_intra_log
|
||||
*
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -30,78 +30,6 @@
|
||||
#include "opal/util/bit_ops.h"
|
||||
|
||||
|
||||
/*
|
||||
* bcast_lin_intra
|
||||
*
|
||||
* Function: - broadcast using O(N) algorithm
|
||||
* Accepts: - same arguments as MPI_Bcast()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_bcast_lin_intra(void *buff, int count,
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i;
|
||||
int size;
|
||||
int rank;
|
||||
int err;
|
||||
ompi_request_t **preq;
|
||||
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
|
||||
ompi_request_t **reqs = basic_module->mccb_reqs;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* Non-root receive the data. */
|
||||
|
||||
if (rank != root) {
|
||||
return MCA_PML_CALL(recv(buff, count, datatype, root,
|
||||
MCA_COLL_BASE_TAG_BCAST, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
}
|
||||
|
||||
/* Root sends data to all others. */
|
||||
|
||||
for (i = 0, preq = reqs; i < size; ++i) {
|
||||
if (i == rank) {
|
||||
continue;
|
||||
}
|
||||
|
||||
err = MCA_PML_CALL(isend_init(buff, count, datatype, i,
|
||||
MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD,
|
||||
comm, preq++));
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
--i;
|
||||
|
||||
/* Start your engines. This will never return an error. */
|
||||
|
||||
MCA_PML_CALL(start(i, reqs));
|
||||
|
||||
/* Wait for them all. If there's an error, note that we don't
|
||||
* care what the error was -- just that there *was* an error. The
|
||||
* PML will finish all requests, even if one or more of them fail.
|
||||
* i.e., by the end of this call, all the requests are free-able.
|
||||
* So free them anyway -- even if there was an error, and return
|
||||
* the error after we free everything. */
|
||||
|
||||
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
|
||||
|
||||
/* Free the reqs */
|
||||
|
||||
mca_coll_basic_free_reqs(reqs, i);
|
||||
|
||||
/* All done */
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* bcast_log_intra
|
||||
*
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -27,68 +27,6 @@
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
|
||||
/*
|
||||
* gather_intra
|
||||
*
|
||||
* Function: - basic gather operation
|
||||
* Accepts: - same arguments as MPI_Gather()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_gather_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i;
|
||||
int err;
|
||||
int rank;
|
||||
int size;
|
||||
char *ptmp;
|
||||
MPI_Aint incr;
|
||||
MPI_Aint extent;
|
||||
MPI_Aint lb;
|
||||
|
||||
size = ompi_comm_size(comm);
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
/* Everyone but root sends data and returns. */
|
||||
|
||||
if (rank != root) {
|
||||
return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
|
||||
MCA_COLL_BASE_TAG_GATHER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
}
|
||||
|
||||
/* I am the root, loop receiving the data. */
|
||||
|
||||
ompi_datatype_get_extent(rdtype, &lb, &extent);
|
||||
incr = extent * rcount;
|
||||
for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
|
||||
if (i == rank) {
|
||||
if (MPI_IN_PLACE != sbuf) {
|
||||
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||
ptmp, rcount, rdtype);
|
||||
} else {
|
||||
err = MPI_SUCCESS;
|
||||
}
|
||||
} else {
|
||||
err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, i,
|
||||
MCA_COLL_BASE_TAG_GATHER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
}
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* gather_inter
|
||||
|
@ -3,10 +3,10 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
@ -129,40 +129,40 @@ mca_coll_basic_comm_query(struct ompi_communicator_t *comm,
|
||||
basic_module->super.coll_scatter = mca_coll_basic_scatter_inter;
|
||||
basic_module->super.coll_scatterv = mca_coll_basic_scatterv_inter;
|
||||
} else if (ompi_comm_size(comm) <= mca_coll_basic_crossover) {
|
||||
basic_module->super.coll_allgather = mca_coll_basic_allgather_intra;
|
||||
basic_module->super.coll_allgatherv = mca_coll_basic_allgatherv_intra;
|
||||
basic_module->super.coll_allgather = ompi_coll_base_allgather_intra_basic_linear;
|
||||
basic_module->super.coll_allgatherv = ompi_coll_base_allgatherv_intra_basic_default;
|
||||
basic_module->super.coll_allreduce = mca_coll_basic_allreduce_intra;
|
||||
basic_module->super.coll_alltoall = mca_coll_basic_alltoall_intra;
|
||||
basic_module->super.coll_alltoallv = mca_coll_basic_alltoallv_intra;
|
||||
basic_module->super.coll_alltoall = ompi_coll_base_alltoall_intra_basic_linear;
|
||||
basic_module->super.coll_alltoallv = ompi_coll_base_alltoallv_intra_basic_linear;
|
||||
basic_module->super.coll_alltoallw = mca_coll_basic_alltoallw_intra;
|
||||
basic_module->super.coll_barrier = mca_coll_basic_barrier_intra_lin;
|
||||
basic_module->super.coll_bcast = mca_coll_basic_bcast_lin_intra;
|
||||
basic_module->super.coll_barrier = ompi_coll_base_barrier_intra_basic_linear;
|
||||
basic_module->super.coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
|
||||
basic_module->super.coll_exscan = mca_coll_basic_exscan_intra;
|
||||
basic_module->super.coll_gather = mca_coll_basic_gather_intra;
|
||||
basic_module->super.coll_gather = ompi_coll_base_gather_intra_basic_linear;
|
||||
basic_module->super.coll_gatherv = mca_coll_basic_gatherv_intra;
|
||||
basic_module->super.coll_reduce = mca_coll_basic_reduce_lin_intra;
|
||||
basic_module->super.coll_reduce = ompi_coll_base_reduce_intra_basic_linear;
|
||||
basic_module->super.coll_reduce_scatter_block = mca_coll_basic_reduce_scatter_block_intra;
|
||||
basic_module->super.coll_reduce_scatter = mca_coll_basic_reduce_scatter_intra;
|
||||
basic_module->super.coll_scan = mca_coll_basic_scan_intra;
|
||||
basic_module->super.coll_scatter = mca_coll_basic_scatter_intra;
|
||||
basic_module->super.coll_scatter = ompi_coll_base_scatter_intra_basic_linear;
|
||||
basic_module->super.coll_scatterv = mca_coll_basic_scatterv_intra;
|
||||
} else {
|
||||
basic_module->super.coll_allgather = mca_coll_basic_allgather_intra;
|
||||
basic_module->super.coll_allgatherv = mca_coll_basic_allgatherv_intra;
|
||||
basic_module->super.coll_allgather = ompi_coll_base_allgather_intra_basic_linear;
|
||||
basic_module->super.coll_allgatherv = ompi_coll_base_allgatherv_intra_basic_default;
|
||||
basic_module->super.coll_allreduce = mca_coll_basic_allreduce_intra;
|
||||
basic_module->super.coll_alltoall = mca_coll_basic_alltoall_intra;
|
||||
basic_module->super.coll_alltoallv = mca_coll_basic_alltoallv_intra;
|
||||
basic_module->super.coll_alltoall = ompi_coll_base_alltoall_intra_basic_linear;
|
||||
basic_module->super.coll_alltoallv = ompi_coll_base_alltoallv_intra_basic_linear;
|
||||
basic_module->super.coll_alltoallw = mca_coll_basic_alltoallw_intra;
|
||||
basic_module->super.coll_barrier = mca_coll_basic_barrier_intra_log;
|
||||
basic_module->super.coll_bcast = mca_coll_basic_bcast_log_intra;
|
||||
basic_module->super.coll_exscan = mca_coll_basic_exscan_intra;
|
||||
basic_module->super.coll_gather = mca_coll_basic_gather_intra;
|
||||
basic_module->super.coll_gather = ompi_coll_base_gather_intra_basic_linear;
|
||||
basic_module->super.coll_gatherv = mca_coll_basic_gatherv_intra;
|
||||
basic_module->super.coll_reduce = mca_coll_basic_reduce_log_intra;
|
||||
basic_module->super.coll_reduce_scatter_block = mca_coll_basic_reduce_scatter_block_intra;
|
||||
basic_module->super.coll_reduce_scatter = mca_coll_basic_reduce_scatter_intra;
|
||||
basic_module->super.coll_scan = mca_coll_basic_scan_intra;
|
||||
basic_module->super.coll_scatter = mca_coll_basic_scatter_intra;
|
||||
basic_module->super.coll_scatter = ompi_coll_base_scatter_intra_basic_linear;
|
||||
basic_module->super.coll_scatterv = mca_coll_basic_scatterv_intra;
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -28,241 +28,6 @@
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/op/op.h"
|
||||
|
||||
/*
|
||||
* reduce_lin_intra
|
||||
*
|
||||
* Function: - reduction using O(N) algorithm
|
||||
* Accepts: - same as MPI_Reduce()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, rank, err, size;
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
char *free_buffer = NULL;
|
||||
char *pml_buffer = NULL;
|
||||
char *inplace_temp = NULL;
|
||||
char *inbuf;
|
||||
|
||||
/* Initialize */
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
|
||||
/* If not root, send data to the root. */
|
||||
|
||||
if (rank != root) {
|
||||
err = MCA_PML_CALL(send(sbuf, count, dtype, root,
|
||||
MCA_COLL_BASE_TAG_REDUCE,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Root receives and reduces messages. Allocate buffer to receive
|
||||
* messages. This comment applies to all collectives in this basic
|
||||
* module where we allocate a temporary buffer. For the next few
|
||||
* lines of code, it's tremendously complicated how we decided that
|
||||
* this was the Right Thing to do. Sit back and enjoy. And prepare
|
||||
* to have your mind warped. :-)
|
||||
*
|
||||
* Recall some definitions (I always get these backwards, so I'm
|
||||
* going to put them here):
|
||||
*
|
||||
* extent: the length from the lower bound to the upper bound -- may
|
||||
* be considerably larger than the buffer required to hold the data
|
||||
* (or smaller! But it's easiest to think about when it's larger).
|
||||
*
|
||||
* true extent: the exact number of bytes required to hold the data
|
||||
* in the layout pattern in the datatype.
|
||||
*
|
||||
* For example, consider the following buffer (just talking about
|
||||
* true_lb, extent, and true extent -- extrapolate for true_ub:
|
||||
*
|
||||
* A B C
|
||||
* --------------------------------------------------------
|
||||
* | | |
|
||||
* --------------------------------------------------------
|
||||
*
|
||||
* There are multiple cases:
|
||||
*
|
||||
* 1. A is what we give to MPI_Send (and friends), and A is where
|
||||
* the data starts, and C is where the data ends. In this case:
|
||||
*
|
||||
* - extent: C-A
|
||||
* - true extent: C-A
|
||||
* - true_lb: 0
|
||||
*
|
||||
* A C
|
||||
* --------------------------------------------------------
|
||||
* | |
|
||||
* --------------------------------------------------------
|
||||
* <=======================extent=========================>
|
||||
* <======================true extent=====================>
|
||||
*
|
||||
* 2. A is what we give to MPI_Send (and friends), B is where the
|
||||
* data starts, and C is where the data ends. In this case:
|
||||
*
|
||||
* - extent: C-A
|
||||
* - true extent: C-B
|
||||
* - true_lb: positive
|
||||
*
|
||||
* A B C
|
||||
* --------------------------------------------------------
|
||||
* | | User buffer |
|
||||
* --------------------------------------------------------
|
||||
* <=======================extent=========================>
|
||||
* <===============true extent=============>
|
||||
*
|
||||
* 3. B is what we give to MPI_Send (and friends), A is where the
|
||||
* data starts, and C is where the data ends. In this case:
|
||||
*
|
||||
* - extent: C-A
|
||||
* - true extent: C-A
|
||||
* - true_lb: negative
|
||||
*
|
||||
* A B C
|
||||
* --------------------------------------------------------
|
||||
* | | User buffer |
|
||||
* --------------------------------------------------------
|
||||
* <=======================extent=========================>
|
||||
* <======================true extent=====================>
|
||||
*
|
||||
* 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is
|
||||
* where the data starts, and C is where the data ends. In this
|
||||
* case:
|
||||
*
|
||||
* - extent: C-MPI_BOTTOM
|
||||
* - true extent: C-B
|
||||
* - true_lb: [potentially very large] positive
|
||||
*
|
||||
* MPI_BOTTOM B C
|
||||
* --------------------------------------------------------
|
||||
* | | User buffer |
|
||||
* --------------------------------------------------------
|
||||
* <=======================extent=========================>
|
||||
* <===============true extent=============>
|
||||
*
|
||||
* So in all cases, for a temporary buffer, all we need to malloc()
|
||||
* is a buffer of size true_extent. We therefore need to know two
|
||||
* pointer values: what value to give to MPI_Send (and friends) and
|
||||
* what value to give to free(), because they might not be the same.
|
||||
*
|
||||
* Clearly, what we give to free() is exactly what was returned from
|
||||
* malloc(). That part is easy. :-)
|
||||
*
|
||||
* What we give to MPI_Send (and friends) is a bit more complicated.
|
||||
* Let's take the 4 cases from above:
|
||||
*
|
||||
* 1. If A is what we give to MPI_Send and A is where the data
|
||||
* starts, then clearly we give to MPI_Send what we got back from
|
||||
* malloc().
|
||||
*
|
||||
* 2. If B is what we get back from malloc, but we give A to
|
||||
* MPI_Send, then the buffer range [A,B) represents "dead space"
|
||||
* -- no data will be put there. So it's safe to give B-true_lb to
|
||||
* MPI_Send. More specifically, the true_lb is positive, so B-true_lb is
|
||||
* actually A.
|
||||
*
|
||||
* 3. If A is what we get back from malloc, and B is what we give to
|
||||
* MPI_Send, then the true_lb is negative, so A-true_lb will actually equal
|
||||
* B.
|
||||
*
|
||||
* 4. Although this seems like the weirdest case, it's actually
|
||||
* quite similar to case #2 -- the pointer we give to MPI_Send is
|
||||
* smaller than the pointer we got back from malloc().
|
||||
*
|
||||
* Hence, in all cases, we give (return_from_malloc - true_lb) to MPI_Send.
|
||||
*
|
||||
* This works fine and dandy if we only have (count==1), which we
|
||||
* rarely do. ;-) So we really need to allocate (true_extent +
|
||||
* ((count - 1) * extent)) to get enough space for the rest. This may
|
||||
* be more than is necessary, but it's ok.
|
||||
*
|
||||
* Simple, no? :-)
|
||||
*
|
||||
*/
|
||||
|
||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
|
||||
if (MPI_IN_PLACE == sbuf) {
|
||||
sbuf = rbuf;
|
||||
inplace_temp = (char*)malloc(true_extent + (count - 1) * extent);
|
||||
if (NULL == inplace_temp) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
rbuf = inplace_temp - true_lb;
|
||||
}
|
||||
|
||||
if (size > 1) {
|
||||
free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
|
||||
if (NULL == free_buffer) {
|
||||
if (NULL != inplace_temp) {
|
||||
free(inplace_temp);
|
||||
}
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
pml_buffer = free_buffer - true_lb;
|
||||
}
|
||||
|
||||
/* Initialize the receive buffer. */
|
||||
|
||||
if (rank == (size - 1)) {
|
||||
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
|
||||
} else {
|
||||
err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
|
||||
MCA_COLL_BASE_TAG_REDUCE, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
}
|
||||
if (MPI_SUCCESS != err) {
|
||||
if (NULL != free_buffer) {
|
||||
free(free_buffer);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Loop receiving and calling reduction function (C or Fortran). */
|
||||
|
||||
for (i = size - 2; i >= 0; --i) {
|
||||
if (rank == i) {
|
||||
inbuf = (char*)sbuf;
|
||||
} else {
|
||||
err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
|
||||
MCA_COLL_BASE_TAG_REDUCE, comm,
|
||||
MPI_STATUS_IGNORE));
|
||||
if (MPI_SUCCESS != err) {
|
||||
if (NULL != free_buffer) {
|
||||
free(free_buffer);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
inbuf = pml_buffer;
|
||||
}
|
||||
|
||||
/* Perform the reduction */
|
||||
|
||||
ompi_op_reduce(op, inbuf, rbuf, count, dtype);
|
||||
}
|
||||
|
||||
if (NULL != inplace_temp) {
|
||||
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp);
|
||||
free(inplace_temp);
|
||||
}
|
||||
if (NULL != free_buffer) {
|
||||
free(free_buffer);
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* reduce_log_intra
|
||||
@ -339,8 +104,8 @@ mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count,
|
||||
* operations. */
|
||||
|
||||
if (!ompi_op_is_commute(op)) {
|
||||
return mca_coll_basic_reduce_lin_intra(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module);
|
||||
return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module);
|
||||
}
|
||||
|
||||
/* Some variables */
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -28,73 +28,6 @@
|
||||
#include "coll_basic.h"
|
||||
|
||||
|
||||
/*
|
||||
* scatter_intra
|
||||
*
|
||||
* Function: - scatter operation
|
||||
* Accepts: - same arguments as MPI_Scatter()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
mca_coll_basic_scatter_intra(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, rank, size, err;
|
||||
char *ptmp;
|
||||
ptrdiff_t lb, incr;
|
||||
|
||||
/* Initialize */
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
|
||||
/* If not root, receive data. */
|
||||
|
||||
if (rank != root) {
|
||||
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
|
||||
MCA_COLL_BASE_TAG_SCATTER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
return err;
|
||||
}
|
||||
|
||||
/* I am the root, loop sending data. */
|
||||
|
||||
err = ompi_datatype_get_extent(sdtype, &lb, &incr);
|
||||
if (OMPI_SUCCESS != err) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
incr *= scount;
|
||||
for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
|
||||
|
||||
/* simple optimization */
|
||||
|
||||
if (i == rank) {
|
||||
if (MPI_IN_PLACE != rbuf) {
|
||||
err =
|
||||
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
|
||||
rdtype);
|
||||
}
|
||||
} else {
|
||||
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
|
||||
MCA_COLL_BASE_TAG_SCATTER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
}
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* scatter_inter
|
||||
*
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -470,6 +470,9 @@ struct mca_coll_base_module_2_1_0_t {
|
||||
be used for the given communicator */
|
||||
mca_coll_base_module_disable_1_1_0_fn_t coll_module_disable;
|
||||
|
||||
/** Data storage for all the algorithms defined in the base. Should
|
||||
not be used by other modules */
|
||||
struct mca_coll_base_comm_t* base_data;
|
||||
};
|
||||
typedef struct mca_coll_base_module_2_1_0_t mca_coll_base_module_2_1_0_t;
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
# Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -19,29 +19,25 @@
|
||||
|
||||
sources = \
|
||||
coll_tuned.h \
|
||||
coll_tuned_topo.h \
|
||||
coll_tuned_util.h \
|
||||
coll_tuned_dynamic_file.h \
|
||||
coll_tuned_dynamic_rules.h \
|
||||
coll_tuned_topo.c \
|
||||
coll_tuned_util.c \
|
||||
coll_tuned_decision_fixed.c \
|
||||
coll_tuned_decision_dynamic.c \
|
||||
coll_tuned_dynamic_file.c \
|
||||
coll_tuned_dynamic_rules.c \
|
||||
coll_tuned_allreduce.c \
|
||||
coll_tuned_alltoall.c \
|
||||
coll_tuned_alltoallv.c \
|
||||
coll_tuned_allgather.c \
|
||||
coll_tuned_allgatherv.c \
|
||||
coll_tuned_barrier.c \
|
||||
coll_tuned_bcast.c \
|
||||
coll_tuned_reduce.c \
|
||||
coll_tuned_reduce_scatter.c \
|
||||
coll_tuned_gather.c \
|
||||
coll_tuned_scatter.c \
|
||||
coll_tuned_component.c \
|
||||
coll_tuned_module.c
|
||||
coll_tuned_module.c \
|
||||
coll_tuned_allgather_decision.c \
|
||||
coll_tuned_allgatherv_decision.c \
|
||||
coll_tuned_allreduce_decision.c \
|
||||
coll_tuned_alltoall_decision.c \
|
||||
coll_tuned_gather_decision.c \
|
||||
coll_tuned_alltoallv_decision.c \
|
||||
coll_tuned_barrier_decision.c \
|
||||
coll_tuned_reduce_decision.c \
|
||||
coll_tuned_bcast_decision.c \
|
||||
coll_tuned_reduce_scatter_decision.c \
|
||||
coll_tuned_scatter_decision.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
|
@ -1,19 +1,8 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -28,61 +17,17 @@
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/request/request.h"
|
||||
|
||||
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
|
||||
/* also need the dynamic rule structures */
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
|
||||
/* some fixed value index vars to simplify certain operations */
|
||||
typedef enum COLLTYPE {
|
||||
ALLGATHER = 0, /* 0 */
|
||||
ALLGATHERV, /* 1 */
|
||||
ALLREDUCE, /* 2 */
|
||||
ALLTOALL, /* 3 */
|
||||
ALLTOALLV, /* 4 */
|
||||
ALLTOALLW, /* 5 */
|
||||
BARRIER, /* 6 */
|
||||
BCAST, /* 7 */
|
||||
EXSCAN, /* 8 */
|
||||
GATHER, /* 9 */
|
||||
GATHERV, /* 10 */
|
||||
REDUCE, /* 11 */
|
||||
REDUCESCATTER, /* 12 */
|
||||
SCAN, /* 13 */
|
||||
SCATTER, /* 14 */
|
||||
SCATTERV, /* 15 */
|
||||
COLLCOUNT /* 16 end counter keep it as last element */
|
||||
} COLLTYPE_T;
|
||||
|
||||
/* defined arg lists to simply auto inclusion of user overriding decision functions */
|
||||
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
|
||||
/* end defined arg lists to simply auto inclusion of user overriding decision functions */
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* these are the same across all modules and are loaded at component query time */
|
||||
extern int ompi_coll_tuned_stream;
|
||||
extern int ompi_coll_tuned_priority;
|
||||
extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;
|
||||
extern bool ompi_coll_tuned_use_dynamic_rules;
|
||||
extern char* ompi_coll_tuned_dynamic_rules_filename;
|
||||
extern int ompi_coll_tuned_init_tree_fanout;
|
||||
@ -148,12 +93,6 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
|
||||
int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS);
|
||||
int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS);
|
||||
|
||||
@ -163,11 +102,6 @@ int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize);
|
||||
int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS);
|
||||
int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS);
|
||||
|
||||
@ -177,11 +111,6 @@ int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
|
||||
int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
|
||||
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
|
||||
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
|
||||
int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
|
||||
int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS);
|
||||
int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
|
||||
int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
|
||||
int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
|
||||
int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS);
|
||||
|
||||
@ -191,11 +120,6 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
|
||||
int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
|
||||
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests);
|
||||
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
|
||||
int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
|
||||
int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
|
||||
int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
|
||||
int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS);
|
||||
int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS);
|
||||
int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS);
|
||||
|
||||
@ -205,8 +129,6 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS);
|
||||
int ompi_coll_tuned_alltoallv_intra_do_forced(ALLTOALLV_ARGS);
|
||||
int ompi_coll_tuned_alltoallv_intra_do_this(ALLTOALLV_ARGS, int algorithm);
|
||||
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
|
||||
int ompi_coll_tuned_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
|
||||
int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS);
|
||||
int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS);
|
||||
|
||||
@ -224,12 +146,6 @@ int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int fanin
|
||||
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
|
||||
int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
|
||||
int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
|
||||
int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
|
||||
int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
|
||||
int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS);
|
||||
int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS);
|
||||
int ompi_coll_tuned_barrier_intra_tree(BARRIER_ARGS);
|
||||
|
||||
/* Bcast */
|
||||
int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree );
|
||||
@ -238,12 +154,6 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
|
||||
int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
|
||||
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
|
||||
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
|
||||
int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
|
||||
int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
|
||||
int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
|
||||
int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
|
||||
int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
|
||||
int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS);
|
||||
int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS);
|
||||
|
||||
@ -259,9 +169,6 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
|
||||
int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS);
|
||||
int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
|
||||
int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS);
|
||||
int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS);
|
||||
int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
|
||||
int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS);
|
||||
int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS);
|
||||
|
||||
@ -278,12 +185,6 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
|
||||
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
|
||||
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs);
|
||||
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
|
||||
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
|
||||
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
|
||||
int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
|
||||
int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
|
||||
|
||||
@ -293,10 +194,6 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
|
||||
int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS);
|
||||
int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
|
||||
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
|
||||
int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
|
||||
int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
|
||||
|
||||
int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS);
|
||||
int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS);
|
||||
|
||||
@ -312,8 +209,6 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
|
||||
int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS);
|
||||
int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
|
||||
int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
|
||||
int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS);
|
||||
int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS);
|
||||
int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS);
|
||||
int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS);
|
||||
|
||||
@ -325,16 +220,6 @@ int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS);
|
||||
|
||||
int mca_coll_tuned_ft_event(int state);
|
||||
|
||||
|
||||
/* Utility functions */
|
||||
|
||||
static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < count; ++i)
|
||||
ompi_request_free(&reqs[i]);
|
||||
}
|
||||
|
||||
struct mca_coll_tuned_component_t {
|
||||
/** Base coll component */
|
||||
mca_coll_base_component_2_0_0_t super;
|
||||
@ -359,200 +244,17 @@ typedef struct mca_coll_tuned_component_t mca_coll_tuned_component_t;
|
||||
*/
|
||||
OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component;
|
||||
|
||||
/*
|
||||
* Data structure for hanging data off the communicator
|
||||
* i.e. per module instance
|
||||
*/
|
||||
struct mca_coll_tuned_comm_t {
|
||||
/* standard data for requests and PML usage */
|
||||
|
||||
/* Precreate space for requests
|
||||
* Note this does not effect basic,
|
||||
* but if in wrong context can confuse a debugger
|
||||
* this is controlled by an MCA param
|
||||
*/
|
||||
|
||||
ompi_request_t **mcct_reqs;
|
||||
int mcct_num_reqs;
|
||||
|
||||
/*
|
||||
* tuned topo information caching per communicator
|
||||
*
|
||||
* for each communicator we cache the topo information so we can
|
||||
* reuse without regenerating if we change the root, [or fanout]
|
||||
* then regenerate and recache this information
|
||||
*/
|
||||
|
||||
/* general tree with n fan out */
|
||||
ompi_coll_tree_t *cached_ntree;
|
||||
int cached_ntree_root;
|
||||
int cached_ntree_fanout;
|
||||
|
||||
/* binary tree */
|
||||
ompi_coll_tree_t *cached_bintree;
|
||||
int cached_bintree_root;
|
||||
|
||||
/* binomial tree */
|
||||
ompi_coll_tree_t *cached_bmtree;
|
||||
int cached_bmtree_root;
|
||||
|
||||
/* binomial tree */
|
||||
ompi_coll_tree_t *cached_in_order_bmtree;
|
||||
int cached_in_order_bmtree_root;
|
||||
|
||||
/* chained tree (fanout followed by pipelines) */
|
||||
ompi_coll_tree_t *cached_chain;
|
||||
int cached_chain_root;
|
||||
int cached_chain_fanout;
|
||||
|
||||
/* pipeline */
|
||||
ompi_coll_tree_t *cached_pipeline;
|
||||
int cached_pipeline_root;
|
||||
|
||||
/* in-order binary tree (root of the in-order binary tree is rank 0) */
|
||||
ompi_coll_tree_t *cached_in_order_bintree;
|
||||
|
||||
/* moving to the component */
|
||||
ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */
|
||||
|
||||
/* for forced algorithms we store the information on the module */
|
||||
/* previously we only had one shared copy, ops, it really is per comm/module */
|
||||
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
|
||||
};
|
||||
typedef struct mca_coll_tuned_comm_t mca_coll_tuned_comm_t;
|
||||
|
||||
struct mca_coll_tuned_module_t {
|
||||
mca_coll_base_module_t super;
|
||||
|
||||
mca_coll_tuned_comm_t *tuned_data;
|
||||
mca_coll_base_module_t super;
|
||||
|
||||
/* for forced algorithms we store the information on the module */
|
||||
/* previously we only had one shared copy, ops, it really is per comm/module */
|
||||
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
|
||||
|
||||
/* the communicator rules for each MPI collective for ONLY my comsize */
|
||||
ompi_coll_com_rule_t *com_rules[COLLCOUNT];
|
||||
};
|
||||
typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t;
|
||||
OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t);
|
||||
|
||||
static inline void mca_coll_tuned_free_reqs(ompi_request_t ** reqs,
|
||||
int count)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < count; ++i)
|
||||
ompi_request_free(reqs + i);
|
||||
}
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
|
||||
if( !( (coll_comm->cached_bintree) \
|
||||
&& (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
|
||||
if( coll_comm->cached_bintree ) { /* destroy previous binomial if defined */ \
|
||||
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
|
||||
} \
|
||||
coll_comm->cached_bintree = ompi_coll_tuned_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
|
||||
coll_comm->cached_bintree_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
|
||||
if( !( (coll_comm->cached_bmtree) \
|
||||
&& (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
|
||||
if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
|
||||
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
|
||||
} \
|
||||
coll_comm->cached_bmtree = ompi_coll_tuned_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
|
||||
coll_comm->cached_bmtree_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_TUNED_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
|
||||
if( !( (coll_comm->cached_in_order_bmtree) \
|
||||
&& (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \
|
||||
if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \
|
||||
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \
|
||||
} \
|
||||
coll_comm->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \
|
||||
coll_comm->cached_in_order_bmtree_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, TUNED_MODULE, ROOT ) \
|
||||
do { \
|
||||
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
|
||||
if( !( (coll_comm->cached_pipeline) \
|
||||
&& (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
|
||||
if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
|
||||
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
|
||||
} \
|
||||
coll_comm->cached_pipeline = ompi_coll_tuned_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
|
||||
coll_comm->cached_pipeline_root = (ROOT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, TUNED_MODULE, ROOT, FANOUT ) \
|
||||
do { \
|
||||
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
|
||||
if( !( (coll_comm->cached_chain) \
|
||||
&& (coll_comm->cached_chain_root == (ROOT)) \
|
||||
&& (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
|
||||
if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
|
||||
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_chain) ); \
|
||||
} \
|
||||
coll_comm->cached_chain = ompi_coll_tuned_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \
|
||||
coll_comm->cached_chain_root = (ROOT); \
|
||||
coll_comm->cached_chain_fanout = (FANOUT); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define COLL_TUNED_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, TUNED_MODULE ) \
|
||||
do { \
|
||||
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
|
||||
if( !(coll_comm->cached_in_order_bintree) ) { \
|
||||
/* In-order binary tree topology is defined by communicator size */ \
|
||||
/* Thus, there is no need to destroy anything */ \
|
||||
coll_comm->cached_in_order_bintree = \
|
||||
ompi_coll_tuned_topo_build_in_order_bintree((OMPI_COMM)); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* This macro give a generic way to compute the best count of
|
||||
* the segment (i.e. the number of complete datatypes that
|
||||
* can fit in the specified SEGSIZE). Beware, when this macro
|
||||
* is called, the SEGCOUNT should be initialized to the count as
|
||||
* expected by the collective call.
|
||||
*/
|
||||
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \
|
||||
if( ((SEGSIZE) >= (TYPELNG)) && \
|
||||
((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \
|
||||
size_t residual; \
|
||||
(SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \
|
||||
residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \
|
||||
if( residual > ((TYPELNG) >> 1) ) \
|
||||
(SEGCOUNT)++; \
|
||||
} \
|
||||
|
||||
/**
|
||||
* This macro gives a generic wait to compute the well distributed block counts
|
||||
* when the count and number of blocks are fixed.
|
||||
* Macro returns "early-block" count, "late-block" count, and "split-index"
|
||||
* which is the block at which we switch from "early-block" count to
|
||||
* the "late-block" count.
|
||||
* count = split_index * early_block_count +
|
||||
* (block_count - split_index) * late_block_count
|
||||
* We do not perform ANY error checks - make sure that the input values
|
||||
* make sense (eg. count > num_blocks).
|
||||
*/
|
||||
#define COLL_TUNED_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \
|
||||
EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
|
||||
EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \
|
||||
SPLIT_INDEX = COUNT % NUM_BLOCKS; \
|
||||
if (0 != SPLIT_INDEX) { \
|
||||
EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \
|
||||
} \
|
||||
|
||||
|
||||
#endif /* MCA_COLL_TUNED_EXPORT_H */
|
||||
|
||||
#endif /* MCA_COLL_TUNED_EXPORT_H */
|
||||
|
218
ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
Обычный файл
218
ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
Обычный файл
@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* allgather algorithm variables */
|
||||
static int coll_tuned_allgather_forced_algorithm = 0;
|
||||
static int coll_tuned_allgather_segment_size = 0;
|
||||
static int coll_tuned_allgather_tree_fanout;
|
||||
static int coll_tuned_allgather_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_allgather_forced_algorithm */
|
||||
static mca_base_var_enum_value_t allgather_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "linear"},
|
||||
{2, "bruck"},
|
||||
{3, "recursive_doubling"},
|
||||
{4, "ring"},
|
||||
{5, "neighbor"},
|
||||
{6, "two_proc"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int
|
||||
ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != allgather_algorithms[cnt].string; cnt++ );
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_count",
|
||||
"Number of allgather algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_allgather_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_allgather_algorithms", allgather_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm",
|
||||
"Which allallgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_allgather_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_segment_size);
|
||||
|
||||
coll_tuned_allgather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_tree_fanout);
|
||||
|
||||
coll_tuned_allgather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgather_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgather_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_forced selected algorithm %d",
|
||||
tuned_module->user_forced[ALLGATHER].algorithm));
|
||||
|
||||
switch (tuned_module->user_forced[ALLGATHER].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_allgather_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (3):
|
||||
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (4):
|
||||
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (5):
|
||||
return ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (6):
|
||||
return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[ALLGATHER].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_allgather_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (3):
|
||||
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (4):
|
||||
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (5):
|
||||
return ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
case (6):
|
||||
return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
212
ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c
Обычный файл
212
ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c
Обычный файл
@ -0,0 +1,212 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
/* allgatherv algorithm variables */
|
||||
static int coll_tuned_allgatherv_forced_algorithm = 0;
|
||||
static int coll_tuned_allgatherv_segment_size = 0;
|
||||
static int coll_tuned_allgatherv_tree_fanout;
|
||||
static int coll_tuned_allgatherv_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_allgatherv_forced_algorithm */
|
||||
static mca_base_var_enum_value_t allgatherv_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "default"},
|
||||
{2, "bruck"},
|
||||
{3, "ring"},
|
||||
{4, "neighbor"},
|
||||
{5, "two_proc"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int
|
||||
ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != allgatherv_algorithms[cnt].string; cnt++ );
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_count",
|
||||
"Number of allgatherv algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_allgatherv_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_allgatherv_algorithms", allgatherv_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm",
|
||||
"Which allallgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_allgatherv_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allgatherv algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_segment_size);
|
||||
|
||||
coll_tuned_allgatherv_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allgatherv algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_tree_fanout);
|
||||
|
||||
coll_tuned_allgatherv_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"allgatherv_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allgatherv algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_allgatherv_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
/*
 * Dispatch the allgatherv to the algorithm the user forced via the
 * coll_tuned_allgatherv_algorithm MCA parameter (stored on the module).
 * Algorithm 0 falls back to the fixed decision rules; out-of-range ids
 * log a message and return MPI_ERR_ARG.
 */
int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
                                               struct ompi_datatype_t *sdtype,
                                               void *rbuf, int *rcounts,
                                               int *rdispls,
                                               struct ompi_datatype_t *rdtype,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
                 tuned_module->user_forced[ALLGATHERV].algorithm));

    /* case values must stay in sync with the allgatherv_algorithms enum table */
    switch (tuned_module->user_forced[ALLGATHERV].algorithm) {
    case (0):
        return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype,
                                                          rbuf, rcounts, rdispls, rdtype,
                                                          comm, module);
    case (1):
        return ompi_coll_base_allgatherv_intra_basic_default(sbuf, scount, sdtype,
                                                             rbuf, rcounts, rdispls, rdtype,
                                                             comm, module);
    case (2):
        return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
                                                     rbuf, rcounts, rdispls, rdtype,
                                                     comm, module);
    case (3):
        return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
                                                    rbuf, rcounts, rdispls, rdtype,
                                                    comm, module);
    case (4):
        return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
                                                                rbuf, rcounts, rdispls, rdtype,
                                                                comm, module);
    case (5):
        return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
                                                         rbuf, rcounts, rdispls, rdtype,
                                                         comm, module);
    } /* switch */
    /* unknown algorithm id: report and fail */
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                 tuned_module->user_forced[ALLGATHERV].algorithm,
                 ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
    return (MPI_ERR_ARG);
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout,
|
||||
int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_allgatherv_intra_basic_default(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (3):
|
||||
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (4):
|
||||
return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
case (5):
|
||||
return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
182
ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
Обычный файл
182
ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
Обычный файл
@ -0,0 +1,182 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
/* allreduce algorithm variables */
static int coll_tuned_allreduce_algorithm_count = 5;  /* NOTE(review): appears unused -- the live count is computed
                                                         from the enum table in check_forced_init; verify before removal */
static int coll_tuned_allreduce_forced_algorithm = 0;  /* 0 == "ignore": defer to the fixed decision rules */
static int coll_tuned_allreduce_segment_size = 0;      /* 0 == no segmentation */
static int coll_tuned_allreduce_tree_fanout;   /* initialized from ompi_coll_tuned_init_tree_fanout at registration */
static int coll_tuned_allreduce_chain_fanout;  /* initialized from ompi_coll_tuned_init_chain_fanout at registration */

/* valid values for coll_tuned_allreduce_forced_algorithm */
static mca_base_var_enum_value_t allreduce_algorithms[] = {
    {0, "ignore"},
    {1, "basic_linear"},
    {2, "nonoverlapping"},
    {3, "recursive_doubling"},
    {4, "ring"},
    {5, "segmented_ring"},
    {0, NULL}  /* sentinel: terminates the table */
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
/*
 * Register the MCA parameters that allow the user to force a specific
 * allreduce algorithm (plus segment size and fanouts).  Called by the
 * component only, during registration; modules read values back through
 * the forced_getvalues path.  Returns MPI_SUCCESS, or the (negative)
 * registration error code if the algorithm parameter failed to register.
 */
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;
    int cnt;

    /* count the algorithms from the NULL-terminated table above */
    for( cnt = 0; NULL != allreduce_algorithms[cnt].string; cnt++ );
    ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = cnt;

    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "allreduce_algorithm_count",
                                           "Number of allreduce algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &cnt);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_allreduce_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_allreduce_algorithms", allreduce_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm",
                                        "Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_forced_algorithm);
    OBJ_RELEASE(new_enum);
    if (mca_param_indices->algorithm_param_index < 0) {
        /* registration failed: propagate the error code */
        return mca_param_indices->algorithm_param_index;
    }

    coll_tuned_allreduce_segment_size = 0;
    mca_param_indices->segsize_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm_segmentsize",
                                        "Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_segment_size);

    coll_tuned_allreduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
    mca_param_indices->tree_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm_tree_fanout",
                                        "Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_tree_fanout);

    coll_tuned_allreduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
    mca_param_indices->chain_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "allreduce_algorithm_chain_fanout",
                                        "Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_allreduce_chain_fanout);

    return (MPI_SUCCESS);
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d, segment size %d",
|
||||
tuned_module->user_forced[ALLREDUCE].algorithm,
|
||||
tuned_module->user_forced[ALLREDUCE].segsize));
|
||||
|
||||
switch (tuned_module->user_forced[ALLREDUCE].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_allreduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_allreduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (3):
|
||||
return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (4):
|
||||
return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
|
||||
case (5):
|
||||
return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, tuned_module->user_forced[ALLREDUCE].segsize);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[ALLREDUCE].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Dispatch the allreduce to an explicitly requested algorithm id (used by
 * the dynamic-rules path).  faninout is logged but unused; segsize is only
 * consumed by the segmented ring (5).  Out-of-range ids log a message and
 * return MPI_ERR_ARG.
 */
int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
                                            struct ompi_datatype_t *dtype,
                                            struct ompi_op_t *op,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module,
                                            int algorithm, int faninout, int segsize)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
                 algorithm, faninout, segsize));

    /* case values must stay in sync with the allreduce_algorithms enum table */
    switch (algorithm) {
    case (0):
        return ompi_coll_tuned_allreduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, comm, module);
    case (1):
        return ompi_coll_base_allreduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, comm, module);
    case (2):
        return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
    case (3):
        return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
    case (4):
        return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
    case (5):
        return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize);
    } /* switch */
    /* unknown algorithm id: report and fail */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
                 algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
    return (MPI_ERR_ARG);
}
|
204
ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c
Обычный файл
204
ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c
Обычный файл
@ -0,0 +1,204 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
/* alltoall algorithm variables */
static int coll_tuned_alltoall_forced_algorithm = 0;  /* 0 == "ignore": defer to the fixed decision rules */
static int coll_tuned_alltoall_segment_size = 0;      /* 0 == no segmentation */
static int coll_tuned_alltoall_max_requests;  /* cap on outstanding requests; only the linear_sync algorithm uses it */
static int coll_tuned_alltoall_tree_fanout;   /* initialized from ompi_coll_tuned_init_tree_fanout at registration */
static int coll_tuned_alltoall_chain_fanout;  /* initialized from ompi_coll_tuned_init_chain_fanout at registration */

/* valid values for coll_tuned_alltoall_forced_algorithm */
static mca_base_var_enum_value_t alltoall_algorithms[] = {
    {0, "ignore"},
    {1, "linear"},
    {2, "pairwise"},
    {3, "modified_bruck"},
    {4, "linear_sync"},
    {5, "two_proc"},
    {0, NULL}  /* sentinel: terminates the table */
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
/*
 * Register the MCA parameters that allow the user to force a specific
 * alltoall algorithm (plus segment size, fanouts, and max outstanding
 * requests).  Called by the component only, during registration; modules
 * read values back through the forced_getvalues path.
 *
 * Returns MPI_SUCCESS, or the (negative) registration error code if the
 * algorithm or max_requests parameter could not be registered.
 */
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;
    int cnt;

    /* count the algorithms from the NULL-terminated table above */
    for( cnt = 0; NULL != alltoall_algorithms[cnt].string; cnt++ );
    ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = cnt;

    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "alltoall_algorithm_count",
                                           "Number of alltoall algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &cnt);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_alltoall_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_alltoall_algorithms", alltoall_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "alltoall_algorithm",
                                        "Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: linear with sync, 5:two proc only.",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_alltoall_forced_algorithm);
    OBJ_RELEASE(new_enum);
    if (mca_param_indices->algorithm_param_index < 0) {
        /* registration failed: propagate the error code */
        return mca_param_indices->algorithm_param_index;
    }

    coll_tuned_alltoall_segment_size = 0;
    mca_param_indices->segsize_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "alltoall_algorithm_segmentsize",
                                        "Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_alltoall_segment_size);

    coll_tuned_alltoall_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
    mca_param_indices->tree_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "alltoall_algorithm_tree_fanout",
                                        "Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_alltoall_tree_fanout);

    coll_tuned_alltoall_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
    mca_param_indices->chain_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "alltoall_algorithm_chain_fanout",
                                        "Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_alltoall_chain_fanout);

    coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
    mca_param_indices->max_requests_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "alltoall_algorithm_max_requests",
                                        "Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_alltoall_max_requests);
    if (mca_param_indices->max_requests_param_index < 0) {
        return mca_param_indices->max_requests_param_index;
    }

    if (coll_tuned_alltoall_max_requests < 0) {
        /* Fixed stale diagnostic: the old text claimed values "greater than 1"
           were required and that we switch to the system-level default, while
           the code only rejects negatives and resets to 0 (unlimited). */
        if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
            opal_output( 0, "Maximum outstanding requests must be non-negative. Switching to 0 (no limit).\n" );
        }
        coll_tuned_alltoall_max_requests = 0;
    }

    return (MPI_SUCCESS);
}
|
||||
|
||||
|
||||
|
||||
/*
 * Dispatch the alltoall to the algorithm the user forced via MCA
 * parameters (stored on the module).  Algorithm 0 falls back to the fixed
 * decision rules; out-of-range ids log a message and return MPI_ERR_ARG.
 */
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
                                             struct ompi_datatype_t *sdtype,
                                             void* rbuf, int rcount,
                                             struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
                 tuned_module->user_forced[ALLTOALL].algorithm));

    /* case values must stay in sync with the alltoall_algorithms enum table */
    switch (tuned_module->user_forced[ALLTOALL].algorithm) {
    case (0):
        return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
    case (1):
        return ompi_coll_base_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
    case (2):
        return ompi_coll_base_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
    case (3):
        return ompi_coll_base_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
    case (4):
        /* linear_sync additionally consumes the forced max_requests cap */
        return ompi_coll_base_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module,
                                                          tuned_module->user_forced[ALLTOALL].max_requests);
    case (5):
        return ompi_coll_base_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
    } /* switch */
    /* unknown algorithm id: report and fail */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
                 tuned_module->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
    return (MPI_ERR_ARG);
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize,
|
||||
int max_requests)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_alltoall_intra_dec_fixed(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (3):
|
||||
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
case (4):
|
||||
return ompi_coll_base_alltoall_intra_linear_sync(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, max_requests);
|
||||
case (5):
|
||||
return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
156
ompi/mca/coll/tuned/coll_tuned_alltoallv_decision.c
Обычный файл
156
ompi/mca/coll/tuned/coll_tuned_alltoallv_decision.c
Обычный файл
@ -0,0 +1,156 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
/* alltoallv algorithm variables */
static int coll_tuned_alltoallv_algorithm_count = 2;  /* NOTE(review): appears unused -- the live count is computed
                                                         from the enum table in check_forced_init; verify before removal */
static int coll_tuned_alltoallv_forced_algorithm = 0;  /* 0 == "ignore": defer to the fixed decision rules */

/* valid values for coll_tuned_alltoallv_forced_algorithm */
static mca_base_var_enum_value_t alltoallv_algorithms[] = {
    {0, "ignore"},
    {1, "basic_linear"},
    {2, "pairwise"},
    {0, NULL}  /* sentinel: terminates the table */
};
|
||||
|
||||
/*
|
||||
* The following are used by dynamic and forced rules. Publish
|
||||
* details of each algorithm and if its forced/fixed/locked in as you add
|
||||
* methods/algorithms you must update this and the query/map routines.
|
||||
* This routine is called by the component only. This makes sure that
|
||||
* the mca parameters are set to their initial values and perms.
|
||||
* Module does not call this. They call the forced_getvalues routine
|
||||
* instead.
|
||||
*/
|
||||
/*
 * Register the MCA parameters that allow the user to force a specific
 * alltoallv algorithm.  Unlike the other collectives, alltoallv has no
 * segment-size or fanout knobs.  Called by the component only; modules
 * read the values back through the forced_getvalues path.  Returns
 * MPI_SUCCESS, or the (negative) registration error code on failure.
 */
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t
                                                      *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;
    int cnt;

    /* count the algorithms from the NULL-terminated table above */
    for( cnt = 0; NULL != alltoallv_algorithms[cnt].string; cnt++ );
    ompi_coll_tuned_forced_max_algorithms[ALLTOALLV] = cnt;

    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "alltoallv_algorithm_count",
                                           "Number of alltoallv algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &cnt);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_alltoallv_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_alltoallv_algorithms", alltoallv_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "alltoallv_algorithm",
                                        "Which alltoallv algorithm is used. "
                                        "Can be locked down to choice of: 0 ignore, "
                                        "1 basic linear, 2 pairwise.",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_alltoallv_forced_algorithm);
    OBJ_RELEASE(new_enum);
    if (mca_param_indices->algorithm_param_index < 0) {
        /* registration failed: propagate the error code */
        return mca_param_indices->algorithm_param_index;
    }

    return (MPI_SUCCESS);
}
|
||||
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoallv_intra_do_forced(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoallv_intra_do_forced selected algorithm %d",
|
||||
tuned_module->user_forced[ALLTOALLV].algorithm));
|
||||
|
||||
switch (tuned_module->user_forced[ALLTOALLV].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:alltoallv_intra_do_forced attempt to "
|
||||
"select algorithm %d when only 0-%d is valid.",
|
||||
tuned_module->user_forced[ALLTOALLV].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
/* If the user selects dynamic rules and specifies the algorithm to
|
||||
* use, then this function is called. */
|
||||
/* If the user selects dynamic rules and specifies the algorithm to
 * use, then this function is called.  Dispatches on the explicit
 * algorithm id; out-of-range ids log a message and return MPI_ERR_ARG. */
int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisps,
                                            struct ompi_datatype_t *sdtype,
                                            void* rbuf, int *rcounts, int *rdisps,
                                            struct ompi_datatype_t *rdtype,
                                            struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module,
                                            int algorithm)
{
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:alltoallv_intra_do_this selected algorithm %d ",
                 algorithm));

    /* case values must stay in sync with the alltoallv_algorithms enum table */
    switch (algorithm) {
    case (0):
        return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
                                                         rbuf, rcounts, rdisps, rdtype,
                                                         comm, module);
    case (1):
        return ompi_coll_base_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
                                                           rbuf, rcounts, rdisps, rdtype,
                                                           comm, module);
    case (2):
        return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
                                                       rbuf, rcounts, rdisps, rdtype,
                                                       comm, module);
    } /* switch */
    /* unknown algorithm id: report and fail */
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:alltoall_intra_do_this attempt to select "
                 "algorithm %d when only 0-%d is valid.",
                 algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
    return (MPI_ERR_ARG);
}
|
135
ompi/mca/coll/tuned/coll_tuned_barrier_decision.c
Обычный файл
135
ompi/mca/coll/tuned/coll_tuned_barrier_decision.c
Обычный файл
@ -0,0 +1,135 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
/* barrier algorithm variables */
|
||||
static int coll_tuned_barrier_forced_algorithm = 0;
|
||||
|
||||
/* valid values for coll_tuned_barrier_forced_algorithm */
|
||||
static mca_base_var_enum_value_t barrier_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "linear"},
|
||||
{2, "double_ring"},
|
||||
{3, "recursive_doubling"},
|
||||
{4, "bruck"},
|
||||
{5, "two_proc"},
|
||||
{6, "tree"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map */
|
||||
/* routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values */
|
||||
/* and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != barrier_algorithms[cnt].string; cnt++ );
|
||||
ompi_coll_tuned_forced_max_algorithms[BARRIER] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm_count",
|
||||
"Number of barrier algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_barrier_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_barrier_algorithms", barrier_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm",
|
||||
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_barrier_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:barrier_intra_do_forced selected algorithm %d",
|
||||
tuned_module->user_forced[BARRIER].algorithm));
|
||||
|
||||
switch (tuned_module->user_forced[BARRIER].algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed(comm, module);
|
||||
case (1): return ompi_coll_base_barrier_intra_basic_linear(comm, module);
|
||||
case (2): return ompi_coll_base_barrier_intra_doublering(comm, module);
|
||||
case (3): return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
|
||||
case (4): return ompi_coll_base_barrier_intra_bruck(comm, module);
|
||||
case (5): return ompi_coll_base_barrier_intra_two_procs(comm, module);
|
||||
case (6): return ompi_coll_base_barrier_intra_tree(comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[BARRIER].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d",
|
||||
algorithm, faninout));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed(comm, module);
|
||||
case (1): return ompi_coll_base_barrier_intra_basic_linear(comm, module);
|
||||
case (2): return ompi_coll_base_barrier_intra_doublering(comm, module);
|
||||
case (3): return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
|
||||
case (4): return ompi_coll_base_barrier_intra_bruck(comm, module);
|
||||
case (5): return ompi_coll_base_barrier_intra_two_procs(comm, module);
|
||||
case (6): return ompi_coll_base_barrier_intra_tree(comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
183
ompi/mca/coll/tuned/coll_tuned_bcast_decision.c
Обычный файл
183
ompi/mca/coll/tuned/coll_tuned_bcast_decision.c
Обычный файл
@ -0,0 +1,183 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
/* bcast algorithm variables */
|
||||
static int coll_tuned_bcast_algorithm_count = 6;
|
||||
static int coll_tuned_bcast_forced_algorithm = 0;
|
||||
static int coll_tuned_bcast_segment_size = 0;
|
||||
static int coll_tuned_bcast_tree_fanout;
|
||||
static int coll_tuned_bcast_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_bcast_forced_algorithm */
|
||||
static mca_base_var_enum_value_t bcast_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "chain"},
|
||||
{3, "pipeline"},
|
||||
{4, "split_binary_tree"},
|
||||
{5, "binary_tree"},
|
||||
{6, "binomial"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != bcast_algorithms[cnt].string; cnt++ );
|
||||
ompi_coll_tuned_forced_max_algorithms[BCAST] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_count",
|
||||
"Number of bcast algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_bcast_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_bcast_algorithms", bcast_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm",
|
||||
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_bcast_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_segment_size);
|
||||
|
||||
coll_tuned_bcast_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_tree_fanout);
|
||||
|
||||
coll_tuned_bcast_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_chain_fanout",
|
||||
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_bcast_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
|
||||
tuned_module->user_forced[BCAST].algorithm));
|
||||
|
||||
switch (tuned_module->user_forced[BCAST].algorithm) {
|
||||
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
|
||||
case (1): return ompi_coll_base_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
|
||||
case (2): return ompi_coll_base_bcast_intra_chain( buf, count, dtype, root, comm, module,
|
||||
tuned_module->user_forced[BCAST].segsize,
|
||||
tuned_module->user_forced[BCAST].chain_fanout );
|
||||
case (3): return ompi_coll_base_bcast_intra_pipeline( buf, count, dtype, root, comm, module,
|
||||
tuned_module->user_forced[BCAST].segsize );
|
||||
case (4): return ompi_coll_base_bcast_intra_split_bintree( buf, count, dtype, root, comm, module,
|
||||
tuned_module->user_forced[BCAST].segsize );
|
||||
case (5): return ompi_coll_base_bcast_intra_bintree( buf, count, dtype, root, comm, module,
|
||||
tuned_module->user_forced[BCAST].segsize );
|
||||
case (6): return ompi_coll_base_bcast_intra_binomial( buf, count, dtype, root, comm, module,
|
||||
tuned_module->user_forced[BCAST].segsize );
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
|
||||
case (1):
|
||||
return ompi_coll_base_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
|
||||
case (2):
|
||||
return ompi_coll_base_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout );
|
||||
case (3):
|
||||
return ompi_coll_base_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
|
||||
case (4):
|
||||
return ompi_coll_base_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
|
||||
case (5):
|
||||
return ompi_coll_base_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
|
||||
case (6):
|
||||
return ompi_coll_base_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
@ -2,10 +2,10 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
@ -44,7 +44,6 @@ const char *ompi_coll_tuned_component_version_string =
|
||||
*/
|
||||
int ompi_coll_tuned_stream = -1;
|
||||
int ompi_coll_tuned_priority = 30;
|
||||
int ompi_coll_tuned_preallocate_memory_comm_size_limit = (32 * 1024);
|
||||
bool ompi_coll_tuned_use_dynamic_rules = false;
|
||||
char* ompi_coll_tuned_dynamic_rules_filename = (char*) NULL;
|
||||
int ompi_coll_tuned_init_tree_fanout = 4;
|
||||
@ -121,16 +120,6 @@ static int tuned_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_coll_tuned_priority);
|
||||
|
||||
/* parameter for pre-allocated memory requests etc */
|
||||
ompi_coll_tuned_preallocate_memory_comm_size_limit = (32 * 1024);
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"pre_allocate_memory_comm_size_limit",
|
||||
"Size of communicator were we stop pre-allocating memory for the fixed internal buffer used for message requests etc that is hung off the communicator data segment. I.e. if you have a 100'000 nodes you might not want to pre-allocate 200'000 request handle slots per communicator instance!",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_6,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&ompi_coll_tuned_preallocate_memory_comm_size_limit);
|
||||
|
||||
/* some initial guesses at topology parameters */
|
||||
ompi_coll_tuned_init_tree_fanout = 4;
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
@ -272,56 +261,13 @@ static int tuned_close(void)
|
||||
static void
|
||||
mca_coll_tuned_module_construct(mca_coll_tuned_module_t *module)
|
||||
{
|
||||
module->tuned_data = NULL;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
mca_coll_tuned_module_destruct(mca_coll_tuned_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_comm_t *data;
|
||||
|
||||
/* Free the space in the data mpool and the data hanging off the
|
||||
communicator */
|
||||
|
||||
data = module->tuned_data;
|
||||
if (NULL != data) {
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
|
||||
the generel c_coll_selected_data */
|
||||
data->mcct_reqs = NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
#endif
|
||||
|
||||
/* free any cached information that has been allocated */
|
||||
if (data->cached_ntree) { /* destroy general tree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_ntree);
|
||||
}
|
||||
if (data->cached_bintree) { /* destroy bintree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_bintree);
|
||||
}
|
||||
if (data->cached_bmtree) { /* destroy bmtree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_bmtree);
|
||||
}
|
||||
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bmtree);
|
||||
}
|
||||
if (data->cached_chain) { /* destroy general chain if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_chain);
|
||||
}
|
||||
if (data->cached_pipeline) { /* destroy pipeline if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_pipeline);
|
||||
}
|
||||
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bintree);
|
||||
}
|
||||
|
||||
free(data);
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
for( int i = 0; i < COLLCOUNT; i++ ) {
|
||||
tuned_module->user_forced[i].algorithm = 0;
|
||||
tuned_module->com_rules[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_coll_tuned_module_t,
|
||||
mca_coll_base_module_t,
|
||||
mca_coll_tuned_module_construct,
|
||||
mca_coll_tuned_module_destruct);
|
||||
OBJ_CLASS_INSTANCE(mca_coll_tuned_module_t, mca_coll_base_module_t,
|
||||
mca_coll_tuned_module_construct, NULL);
|
||||
|
@ -2,18 +2,18 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -28,13 +28,10 @@
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
#include "coll_tuned.h"
|
||||
|
||||
|
||||
/*
|
||||
* Notes on evaluation rules and ordering
|
||||
*
|
||||
* The order is:
|
||||
* Notes on evaluation rules and ordering
|
||||
*
|
||||
* The order is:
|
||||
* use file based rules if presented (-coll_tuned_dynamic_rules_filename = rules)
|
||||
* Else
|
||||
* use forced rules (-coll_tuned_dynamic_ALG_intra_algorithm = algorithm-number)
|
||||
@ -58,12 +55,11 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[ALLREDUCE]) {
|
||||
if (tuned_module->com_rules[ALLREDUCE]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize;
|
||||
@ -71,7 +67,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
ompi_datatype_type_size (dtype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLREDUCE],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLREDUCE],
|
||||
dsize, &faninout, &segsize, &ignoreme);
|
||||
|
||||
if (alg) {
|
||||
@ -82,7 +78,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[ALLREDUCE].algorithm) {
|
||||
if (tuned_module->user_forced[ALLREDUCE].algorithm) {
|
||||
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
@ -91,27 +87,26 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
}
|
||||
|
||||
/*
|
||||
* alltoall_intra_dec
|
||||
* alltoall_intra_dec
|
||||
*
|
||||
* Function: - seletects alltoall algorithm to use
|
||||
* Accepts: - same arguments as MPI_Alltoall()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the bcast implementation)
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[ALLTOALL]) {
|
||||
if (tuned_module->com_rules[ALLTOALL]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int comsize;
|
||||
int alg, faninout, segsize, max_requests;
|
||||
@ -121,7 +116,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
comsize = ompi_comm_size(comm);
|
||||
dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLTOALL],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALL],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -133,7 +128,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[ALLTOALL].algorithm) {
|
||||
if (tuned_module->user_forced[ALLTOALL].algorithm) {
|
||||
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
@ -152,12 +147,11 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoallv_intra_dec_dynamic"));
|
||||
|
||||
@ -167,10 +161,10 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
||||
* This allow the users to specify the alltoallv algorithm to be used only
|
||||
* based on the communicator size.
|
||||
*/
|
||||
if (data->com_rules[ALLTOALLV]) {
|
||||
if (tuned_module->com_rules[ALLTOALLV]) {
|
||||
int alg, faninout, segsize, max_requests;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLTOALLV],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALLV],
|
||||
0, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -182,7 +176,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[ALLTOALLV].algorithm) {
|
||||
if (tuned_module->user_forced[ALLTOALLV].algorithm) {
|
||||
return ompi_coll_tuned_alltoallv_intra_do_forced(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
@ -193,7 +187,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
||||
}
|
||||
|
||||
/*
|
||||
* barrier_intra_dec
|
||||
* barrier_intra_dec
|
||||
*
|
||||
* Function: - seletects barrier algorithm to use
|
||||
* Accepts: - same arguments as MPI_Barrier()
|
||||
@ -203,16 +197,15 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[BARRIER]) {
|
||||
if (tuned_module->com_rules[BARRIER]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[BARRIER],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BARRIER],
|
||||
0, &faninout, &segsize, &ignoreme);
|
||||
|
||||
if (alg) {
|
||||
@ -222,14 +215,14 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[BARRIER].algorithm) {
|
||||
if (tuned_module->user_forced[BARRIER].algorithm) {
|
||||
return ompi_coll_tuned_barrier_intra_do_forced (comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* bcast_intra_dec
|
||||
* bcast_intra_dec
|
||||
*
|
||||
* Function: - seletects broadcast algorithm to use
|
||||
* Accepts: - same arguments as MPI_Bcast()
|
||||
@ -241,12 +234,11 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:bcast_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[BCAST]) {
|
||||
if (tuned_module->com_rules[BCAST]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize;
|
||||
@ -254,7 +246,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
ompi_datatype_type_size (datatype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[BCAST],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BCAST],
|
||||
dsize, &faninout, &segsize, &ignoreme);
|
||||
|
||||
if (alg) {
|
||||
@ -266,7 +258,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
} /*end if any com rules to check */
|
||||
|
||||
|
||||
if (data->user_forced[BCAST].algorithm) {
|
||||
if (tuned_module->user_forced[BCAST].algorithm) {
|
||||
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root,
|
||||
comm, module);
|
||||
}
|
||||
@ -275,12 +267,12 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_intra_dec
|
||||
* reduce_intra_dec
|
||||
*
|
||||
* Function: - seletects reduce algorithm to use
|
||||
* Accepts: - same arguments as MPI_reduce()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the reduce implementation)
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
@ -289,12 +281,11 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[REDUCE]) {
|
||||
if (tuned_module->com_rules[REDUCE]) {
|
||||
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, max_requests;
|
||||
@ -303,21 +294,21 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
ompi_datatype_type_size (datatype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[REDUCE],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCE],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
|
||||
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
|
||||
op, root,
|
||||
comm, module,
|
||||
alg, faninout,
|
||||
segsize,
|
||||
alg, faninout,
|
||||
segsize,
|
||||
max_requests);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[REDUCE].algorithm) {
|
||||
if (tuned_module->user_forced[REDUCE].algorithm) {
|
||||
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype,
|
||||
op, root,
|
||||
comm, module);
|
||||
@ -328,15 +319,15 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_scatter_intra_dec
|
||||
* reduce_scatter_intra_dec
|
||||
*
|
||||
* Function: - seletects reduce_scatter algorithm to use
|
||||
* Accepts: - same arguments as MPI_Reduce_scatter()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from
|
||||
* the reduce_scatter implementation)
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
@ -344,13 +335,12 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[REDUCESCATTER]) {
|
||||
/* we do, so calc the message size or what ever we need and use
|
||||
if (tuned_module->com_rules[REDUCESCATTER]) {
|
||||
/* we do, so calc the message size or what ever we need and use
|
||||
this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme, i, count, size;
|
||||
size_t dsize;
|
||||
@ -359,21 +349,21 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
ompi_datatype_type_size (dtype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[REDUCESCATTER],
|
||||
dsize, &faninout,
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCESCATTER],
|
||||
dsize, &faninout,
|
||||
&segsize, &ignoreme);
|
||||
if (alg) {
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module,
|
||||
alg, faninout,
|
||||
alg, faninout,
|
||||
segsize);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[REDUCESCATTER].algorithm) {
|
||||
return ompi_coll_tuned_reduce_scatter_intra_do_forced (sbuf, rbuf, rcounts,
|
||||
|
||||
if (tuned_module->user_forced[REDUCESCATTER].algorithm) {
|
||||
return ompi_coll_tuned_reduce_scatter_intra_do_forced (sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
@ -383,7 +373,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
}
|
||||
|
||||
/*
|
||||
* allgather_intra_dec
|
||||
* allgather_intra_dec
|
||||
*
|
||||
* Function: - seletects allgather algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgather()
|
||||
@ -391,58 +381,57 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
* allgather function).
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgather_intra_dec_dynamic"));
|
||||
|
||||
if (data->com_rules[ALLGATHER]) {
|
||||
|
||||
if (tuned_module->com_rules[ALLGATHER]) {
|
||||
/* We have file based rules:
|
||||
- calculate message size and other necessary information */
|
||||
int comsize;
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize;
|
||||
|
||||
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
comsize = ompi_comm_size(comm);
|
||||
dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHER],
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHER],
|
||||
dsize, &faninout, &segsize, &ignoreme);
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
this message size */
|
||||
return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module,
|
||||
alg, faninout, segsize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We do not have file based rules */
|
||||
if (data->user_forced[ALLGATHER].algorithm) {
|
||||
if (tuned_module->user_forced[ALLGATHER].algorithm) {
|
||||
/* User-forced algorithm */
|
||||
return ompi_coll_tuned_allgather_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_allgather_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Use default decision */
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* allgatherv_intra_dec
|
||||
* allgatherv_intra_dec
|
||||
*
|
||||
* Function: - seletects allgatherv algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgatherv()
|
||||
@ -450,71 +439,69 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
* allgatherv function).
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts,
|
||||
void* rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgatherv_intra_dec_dynamic"));
|
||||
|
||||
if (data->com_rules[ALLGATHERV]) {
|
||||
|
||||
if (tuned_module->com_rules[ALLGATHERV]) {
|
||||
/* We have file based rules:
|
||||
- calculate message size and other necessary information */
|
||||
int comsize, i;
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize, total_size;
|
||||
|
||||
comsize = ompi_comm_size(comm);
|
||||
comsize = ompi_comm_size(comm);
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
total_size = 0;
|
||||
for (i = 0; i < comsize; i++) { total_size += dsize * rcounts[i]; }
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHERV],
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHERV],
|
||||
total_size, &faninout, &segsize, &ignoreme);
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
this message size */
|
||||
return ompi_coll_tuned_allgatherv_intra_do_this (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
comm, module,
|
||||
alg, faninout, segsize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We do not have file based rules */
|
||||
if (data->user_forced[ALLGATHERV].algorithm) {
|
||||
if (tuned_module->user_forced[ALLGATHERV].algorithm) {
|
||||
/* User-forced algorithm */
|
||||
return ompi_coll_tuned_allgatherv_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
return ompi_coll_tuned_allgatherv_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Use default decision */
|
||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_dec_dynamic"));
|
||||
@ -522,15 +509,15 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
/**
|
||||
* check to see if we have some filebased rules.
|
||||
*/
|
||||
if (data->com_rules[GATHER]) {
|
||||
if (tuned_module->com_rules[GATHER]) {
|
||||
int comsize, alg, faninout, segsize, max_requests;
|
||||
size_t dsize;
|
||||
|
||||
comsize = ompi_comm_size(comm);
|
||||
comsize = ompi_comm_size(comm);
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
dsize *= comsize;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[GATHER],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[GATHER],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -542,26 +529,25 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[GATHER].algorithm) {
|
||||
if (tuned_module->user_forced[GATHER].algorithm) {
|
||||
return ompi_coll_tuned_gather_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_scatter_intra_dec_dynamic"));
|
||||
@ -569,15 +555,15 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
/**
|
||||
* check to see if we have some filebased rules.
|
||||
*/
|
||||
if (data->com_rules[SCATTER]) {
|
||||
if (tuned_module->com_rules[SCATTER]) {
|
||||
int comsize, alg, faninout, segsize, max_requests;
|
||||
size_t dsize;
|
||||
|
||||
comsize = ompi_comm_size(comm);
|
||||
comsize = ompi_comm_size(comm);
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
dsize *= comsize;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[SCATTER],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[SCATTER],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -589,13 +575,13 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[SCATTER].algorithm) {
|
||||
if (tuned_module->user_forced[SCATTER].algorithm) {
|
||||
return ompi_coll_tuned_scatter_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
@ -3,10 +3,10 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
@ -14,9 +14,9 @@
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -31,7 +31,6 @@
|
||||
#include "ompi/op/op.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
|
||||
/*
|
||||
* allreduce_intra
|
||||
*
|
||||
@ -40,11 +39,11 @@
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
size_t dsize, block_dsize;
|
||||
int comm_size = ompi_comm_size(comm);
|
||||
@ -53,8 +52,8 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
|
||||
/**
|
||||
* Decision function based on MX results from the Grig cluster at UTK.
|
||||
*
|
||||
* Currently, linear, recursive doubling, and nonoverlapping algorithms
|
||||
*
|
||||
* Currently, linear, recursive doubling, and nonoverlapping algorithms
|
||||
* can handle both commutative and non-commutative operations.
|
||||
* Ring algorithm does not support non-commutative operations.
|
||||
*/
|
||||
@ -62,40 +61,40 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
block_dsize = dsize * (ptrdiff_t)count;
|
||||
|
||||
if (block_dsize < intermediate_message) {
|
||||
return (ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module));
|
||||
}
|
||||
return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module));
|
||||
}
|
||||
|
||||
if( ompi_op_is_commute(op) && (count > comm_size) ) {
|
||||
const size_t segment_size = 1 << 20; /* 1 MB */
|
||||
if (((size_t)comm_size * (size_t)segment_size >= block_dsize)) {
|
||||
return (ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype,
|
||||
op, comm, module));
|
||||
return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype,
|
||||
op, comm, module));
|
||||
} else {
|
||||
return (ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module,
|
||||
segment_size));
|
||||
return (ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module,
|
||||
segment_size));
|
||||
}
|
||||
}
|
||||
|
||||
return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count,
|
||||
dtype, op, comm, module));
|
||||
return (ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count,
|
||||
dtype, op, comm, module));
|
||||
}
|
||||
|
||||
/*
|
||||
* alltoall_intra_dec
|
||||
* alltoall_intra_dec
|
||||
*
|
||||
* Function: - seletects alltoall algorithm to use
|
||||
* Accepts: - same arguments as MPI_Alltoall()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the bcast implementation)
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
@ -109,12 +108,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
|
||||
/* special case */
|
||||
if (communicator_size==2) {
|
||||
return ompi_coll_tuned_alltoall_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Decision function based on measurement on Grig cluster at
|
||||
/* Decision function based on measurement on Grig cluster at
|
||||
the University of Tennessee (2GB MX) up to 64 nodes.
|
||||
Has better performance for messages of intermediate sizes than the old one */
|
||||
/* determine block size */
|
||||
@ -123,19 +122,19 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
|
||||
if ((block_dsize < (size_t) ompi_coll_tuned_alltoall_small_msg)
|
||||
&& (communicator_size > 12)) {
|
||||
return ompi_coll_tuned_alltoall_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
|
||||
} else if (block_dsize < (size_t) ompi_coll_tuned_alltoall_intermediate_msg) {
|
||||
return ompi_coll_tuned_alltoall_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
|
||||
#if 0
|
||||
/* previous decision */
|
||||
@ -148,12 +147,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
ompi_comm_rank(comm), communicator_size, total_dsize));
|
||||
|
||||
if (communicator_size >= 12 && total_dsize <= 768) {
|
||||
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
}
|
||||
if (total_dsize <= 131072) {
|
||||
return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -170,14 +169,14 @@ int ompi_coll_tuned_alltoallv_intra_dec_fixed(void *sbuf, int *scounts, int *sdi
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
/* For starters, just keep the original algorithm. */
|
||||
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps,rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps,rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* barrier_intra_dec
|
||||
* barrier_intra_dec
|
||||
*
|
||||
* Function: - seletects barrier algorithm to use
|
||||
* Accepts: - same arguments as MPI_Barrier()
|
||||
@ -192,7 +191,7 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
|
||||
communicator_size));
|
||||
|
||||
if( 2 == communicator_size )
|
||||
return ompi_coll_tuned_barrier_intra_two_procs(comm, module);
|
||||
return ompi_coll_base_barrier_intra_two_procs(comm, module);
|
||||
/**
|
||||
* Basic optimisation. If we have a power of 2 number of nodes
|
||||
* the use the recursive doubling algorithm, otherwise
|
||||
@ -203,19 +202,17 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
|
||||
for( ; communicator_size > 0; communicator_size >>= 1 ) {
|
||||
if( communicator_size & 0x1 ) {
|
||||
if( has_one )
|
||||
return ompi_coll_tuned_barrier_intra_bruck(comm, module);
|
||||
return ompi_coll_base_barrier_intra_bruck(comm, module);
|
||||
has_one = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ompi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
|
||||
/* return ompi_coll_tuned_barrier_intra_linear(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_doublering(comm); */
|
||||
return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* bcast_intra_dec
|
||||
* bcast_intra_dec
|
||||
*
|
||||
* Function: - seletects broadcast algorithm to use
|
||||
* Accepts: - same arguments as MPI_Bcast()
|
||||
@ -226,14 +223,14 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
/* Decision function based on MX results for
|
||||
/* Decision function based on MX results for
|
||||
messages up to 36MB and communicator sizes up to 64 nodes */
|
||||
const size_t small_message_size = 2048;
|
||||
const size_t intermediate_message_size = 370728;
|
||||
const double a_p16 = 3.2118e-6; /* [1 / byte] */
|
||||
const double b_p16 = 8.7936;
|
||||
const double b_p16 = 8.7936;
|
||||
const double a_p64 = 2.3679e-6; /* [1 / byte] */
|
||||
const double b_p64 = 1.1787;
|
||||
const double b_p64 = 1.1787;
|
||||
const double a_p128 = 1.6134e-6; /* [1 / byte] */
|
||||
const double b_p128 = 2.1102;
|
||||
|
||||
@ -251,95 +248,95 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
" root %d rank %d com_size %d msg_length %lu",
|
||||
root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
|
||||
|
||||
/* Handle messages of small and intermediate size, and
|
||||
/* Handle messages of small and intermediate size, and
|
||||
single-element broadcasts */
|
||||
if ((message_size < small_message_size) || (count <= 1)) {
|
||||
/* Binomial without segmentation */
|
||||
segsize = 0;
|
||||
return ompi_coll_tuned_bcast_intra_binomial (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_binomial(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (message_size < intermediate_message_size) {
|
||||
/* SplittedBinary with 1KB segments */
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
}
|
||||
}
|
||||
/* Handle large message sizes */
|
||||
else if (communicator_size < (a_p128 * message_size + b_p128)) {
|
||||
/* Pipeline with 128KB segments */
|
||||
segsize = 1024 << 7;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (communicator_size < 13) {
|
||||
/* Split Binary with 8KB segments */
|
||||
segsize = 1024 << 3;
|
||||
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (communicator_size < (a_p64 * message_size + b_p64)) {
|
||||
/* Pipeline with 64KB segments */
|
||||
segsize = 1024 << 6;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (communicator_size < (a_p16 * message_size + b_p16)) {
|
||||
/* Pipeline with 16KB segments */
|
||||
segsize = 1024 << 4;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
}
|
||||
|
||||
/* Pipeline with 8KB segments */
|
||||
segsize = 1024 << 3;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
#if 0
|
||||
/* this is based on gige measurements */
|
||||
|
||||
if (communicator_size < 4) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
|
||||
return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
|
||||
}
|
||||
if (communicator_size == 4) {
|
||||
if (message_size < 524288) segsize = 0;
|
||||
else segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
|
||||
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
|
||||
}
|
||||
if (communicator_size <= 8 && message_size < 4096) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
|
||||
return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
|
||||
}
|
||||
if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
|
||||
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
|
||||
}
|
||||
if (message_size >= 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype, root, comm, module, segsize);
|
||||
}
|
||||
segsize = 0;
|
||||
/* once tested can swap this back in */
|
||||
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
|
||||
/* return ompi_coll_base_bcast_intra_bmtree(buff, count, datatype, root, comm, segsize); */
|
||||
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
|
||||
#endif /* 0 */
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_intra_dec
|
||||
* reduce_intra_dec
|
||||
*
|
||||
* Function: - seletects reduce algorithm to use
|
||||
* Accepts: - same arguments as MPI_reduce()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the reduce implementation)
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
@ -367,15 +364,15 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
message_size = dsize * (ptrdiff_t)count; /* needed for decision */
|
||||
|
||||
/**
|
||||
* If the operation is non commutative we currently have choice of linear
|
||||
* If the operation is non commutative we currently have choice of linear
|
||||
* or in-order binary tree algorithm.
|
||||
*/
|
||||
if( !ompi_op_is_commute(op) ) {
|
||||
if ((communicator_size < 12) && (message_size < 2048)) {
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
0, max_requests);
|
||||
return ompi_coll_base_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
}
|
||||
return ompi_coll_base_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
0, max_requests);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed "
|
||||
@ -384,27 +381,27 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
|
||||
if ((communicator_size < 8) && (message_size < 512)){
|
||||
/* Linear_0K */
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
} else if (((communicator_size < 8) && (message_size < 20480)) ||
|
||||
(message_size < 2048) || (count <= 1)) {
|
||||
/* Binomial_0K */
|
||||
segsize = 0;
|
||||
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} else if (communicator_size > (a1 * message_size + b1)) {
|
||||
/* Binomial_1K */
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} else if (communicator_size > (a2 * message_size + b2)) {
|
||||
/* Pipeline_1K */
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} else if (communicator_size > (a3 * message_size + b3)) {
|
||||
/* Binary_32K */
|
||||
segsize = 32*1024;
|
||||
return ompi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
|
||||
return ompi_coll_base_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
|
||||
comm, module, segsize, max_requests);
|
||||
}
|
||||
if (communicator_size > (a4 * message_size + b4)) {
|
||||
@ -414,8 +411,8 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
/* Pipeline_64K */
|
||||
segsize = 64*1024;
|
||||
}
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
|
||||
#if 0
|
||||
/* for small messages use linear algorithm */
|
||||
@ -424,8 +421,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
fanout = communicator_size - 1;
|
||||
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
||||
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
||||
return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
}
|
||||
if (message_size < 524288) {
|
||||
if (message_size <= 65536 ) {
|
||||
@ -437,21 +433,21 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
}
|
||||
/* later swap this for a binary tree */
|
||||
/* fanout = 2; */
|
||||
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, fanout, max_requests);
|
||||
return ompi_coll_base_reduce_intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, fanout, max_requests);
|
||||
}
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
#endif /* 0 */
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_scatter_intra_dec
|
||||
* reduce_scatter_intra_dec
|
||||
*
|
||||
* Function: - seletects reduce_scatter algorithm to use
|
||||
* Accepts: - same arguments as MPI_Reduce_scatter()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from
|
||||
* Returns: - MPI_SUCCESS or error code (passed from
|
||||
* the reduce scatter implementation)
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
@ -474,16 +470,16 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
/* We need data size for decision function */
|
||||
ompi_datatype_type_size(dtype, &dsize);
|
||||
total_message_size = 0;
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
total_message_size += rcounts[i];
|
||||
}
|
||||
|
||||
if( !ompi_op_is_commute(op) ) {
|
||||
return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
|
||||
total_message_size *= dsize;
|
||||
|
||||
/* compute the nearest power of 2 */
|
||||
@ -492,18 +488,18 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
if ((total_message_size <= small_message_size) ||
|
||||
((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
|
||||
(comm_size >= a * total_message_size + b)) {
|
||||
return
|
||||
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
||||
return
|
||||
ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* allgather_intra_dec
|
||||
* allgather_intra_dec
|
||||
*
|
||||
* Function: - seletects allgather algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgather()
|
||||
@ -511,10 +507,10 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
* internal allgather function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
@ -525,78 +521,78 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
|
||||
/* Special case for 2 processes */
|
||||
if (communicator_size == 2) {
|
||||
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Determine complete data size */
|
||||
ompi_datatype_type_size(sdtype, &dsize);
|
||||
total_dsize = dsize * (ptrdiff_t)scount * (ptrdiff_t)communicator_size;
|
||||
|
||||
total_dsize = dsize * (ptrdiff_t)scount * (ptrdiff_t)communicator_size;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
|
||||
" rank %d com_size %d msg_length %lu",
|
||||
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
||||
|
||||
pow2_size = opal_next_poweroftwo_inclusive (communicator_size);
|
||||
|
||||
/* Decision based on MX 2Gb results from Grig cluster at
|
||||
The University of Tennesse, Knoxville
|
||||
- if total message size is less than 50KB use either bruck or
|
||||
recursive doubling for non-power of two and power of two nodes,
|
||||
/* Decision based on MX 2Gb results from Grig cluster at
|
||||
The University of Tennesse, Knoxville
|
||||
- if total message size is less than 50KB use either bruck or
|
||||
recursive doubling for non-power of two and power of two nodes,
|
||||
respectively.
|
||||
- else use ring and neighbor exchange algorithms for odd and even
|
||||
- else use ring and neighbor exchange algorithms for odd and even
|
||||
number of nodes, respectively.
|
||||
*/
|
||||
if (total_dsize < 50000) {
|
||||
if (pow2_size == communicator_size) {
|
||||
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
} else {
|
||||
if (communicator_size % 2) {
|
||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if defined(USE_MPICH2_DECISION)
|
||||
/* Decision as in MPICH-2
|
||||
presented in Thakur et.al. "Optimization of Collective Communication
|
||||
Operations in MPICH", International Journal of High Performance Computing
|
||||
/* Decision as in MPICH-2
|
||||
presented in Thakur et.al. "Optimization of Collective Communication
|
||||
Operations in MPICH", International Journal of High Performance Computing
|
||||
Applications, Vol. 19, No. 1, 49-66 (2005)
|
||||
- for power-of-two processes and small and medium size messages
|
||||
- for power-of-two processes and small and medium size messages
|
||||
(up to 512KB) use recursive doubling
|
||||
- for non-power-of-two processes and small messages (80KB) use bruck,
|
||||
- for everything else use ring.
|
||||
*/
|
||||
if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
|
||||
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else if (total_dsize <= 81920) {
|
||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else if (total_dsize <= 81920) {
|
||||
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
#endif /* defined(USE_MPICH2_DECISION) */
|
||||
}
|
||||
|
||||
/*
|
||||
* allgatherv_intra_dec
|
||||
* allgatherv_intra_dec
|
||||
*
|
||||
* Function: - seletects allgatherv algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgatherv()
|
||||
@ -604,59 +600,59 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
* internal allgatherv function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts,
|
||||
void* rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i;
|
||||
int communicator_size;
|
||||
size_t dsize, total_dsize;
|
||||
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
|
||||
|
||||
/* Special case for 2 processes */
|
||||
if (communicator_size == 2) {
|
||||
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
|
||||
/* Determine complete data size */
|
||||
ompi_datatype_type_size(sdtype, &dsize);
|
||||
total_dsize = 0;
|
||||
for (i = 0; i < communicator_size; i++) {
|
||||
total_dsize += dsize * (ptrdiff_t)rcounts[i];
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgatherv_intra_dec_fixed"
|
||||
" rank %d com_size %d msg_length %lu",
|
||||
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
||||
|
||||
|
||||
/* Decision based on allgather decision. */
|
||||
if (total_dsize < 50000) {
|
||||
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
if (communicator_size % 2) {
|
||||
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* gather_intra_dec
|
||||
* gather_intra_dec
|
||||
*
|
||||
* Function: - seletects gather algorithm to use
|
||||
* Accepts: - same arguments as MPI_Gather()
|
||||
@ -664,10 +660,10 @@ int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
||||
* internal allgather function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
@ -685,7 +681,7 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
int communicator_size, rank;
|
||||
size_t dsize, block_size;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_dec_fixed"));
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
@ -701,33 +697,32 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
}
|
||||
|
||||
if (block_size > large_block_size) {
|
||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
large_segment_size);
|
||||
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
large_segment_size);
|
||||
|
||||
} else if (block_size > intermediate_block_size) {
|
||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
small_segment_size);
|
||||
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
small_segment_size);
|
||||
|
||||
} else if ((communicator_size > large_communicator_size) ||
|
||||
((communicator_size > small_communicator_size) &&
|
||||
(block_size < small_block_size))) {
|
||||
return ompi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
|
||||
return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
/* Otherwise, use basic linear */
|
||||
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* scatter_intra_dec
|
||||
* scatter_intra_dec
|
||||
*
|
||||
* Function: - seletects scatter algorithm to use
|
||||
* Accepts: - same arguments as MPI_Scatter()
|
||||
@ -735,10 +730,10 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
* internal allgather function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
@ -747,7 +742,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
int communicator_size, rank;
|
||||
size_t dsize, block_size;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_scatter_intra_dec_fixed"));
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
@ -759,15 +754,15 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
} else {
|
||||
ompi_datatype_type_size(rdtype, &dsize);
|
||||
block_size = dsize * (ptrdiff_t)rcount;
|
||||
}
|
||||
}
|
||||
|
||||
if ((communicator_size > small_comm_size) &&
|
||||
(block_size < small_block_size)) {
|
||||
return ompi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
@ -1,21 +1,20 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -28,7 +27,7 @@
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
|
||||
/* also need the dynamic rule structures */
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
@ -43,7 +42,7 @@ static long getnext (FILE *fptr); /* local function */
|
||||
|
||||
static int fileline=0; /* used for verbose error messages */
|
||||
|
||||
/*
|
||||
/*
|
||||
* Reads a rule file called fname
|
||||
* Builds the algorithm rule table for a max of n_collectives
|
||||
*
|
||||
@ -97,6 +96,10 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
|
||||
/* make space and init the algorithm rules for each of the n_collectives MPI collectives */
|
||||
alg_rules = ompi_coll_tuned_mk_alg_rules (n_collectives);
|
||||
if (NULL == alg_rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"cannot cannot allocate rules for file [%s]\n", fname));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
if (NULL == alg_rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"cannot cannot allocate rules for file [%s]\n", fname));
|
||||
@ -127,10 +130,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
|
||||
if (alg_rules[CI].alg_rule_id != CI) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI));
|
||||
fclose(fptr);
|
||||
ompi_coll_tuned_free_all_rules (alg_rules, n_collectives);
|
||||
*rules = (ompi_coll_alg_rule_t*) NULL;
|
||||
return (-4);
|
||||
goto on_file_error;
|
||||
}
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI));
|
||||
alg_p = &alg_rules[CI];
|
||||
@ -151,7 +151,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
for (ncs=0;ncs<NCS;ncs++) { /* for each comm size */
|
||||
|
||||
com_p = &(alg_p->com_rules[ncs]);
|
||||
|
||||
|
||||
CS = (int)getnext (fptr);
|
||||
if (CS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
@ -165,7 +165,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n",
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n",
|
||||
NMS, CI, CS));
|
||||
com_p->n_msg_sizes = NMS;
|
||||
com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS);
|
||||
@ -222,7 +222,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI));
|
||||
|
||||
} /* per collective */
|
||||
|
||||
|
||||
fclose (fptr);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n"));
|
||||
@ -291,4 +291,3 @@ static long getnext (FILE *fptr)
|
||||
if ('#' == trash) skiptonewline (fptr);
|
||||
} while (1);
|
||||
}
|
||||
|
||||
|
@ -2,18 +2,18 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -25,7 +25,7 @@
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
|
||||
/* also need the dynamic rule structures */
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
@ -33,7 +33,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "coll_tuned_util.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
|
||||
ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
|
||||
@ -43,7 +43,7 @@ ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
|
||||
|
||||
alg_rules = (ompi_coll_alg_rule_t *) calloc (n_alg, sizeof (ompi_coll_alg_rule_t));
|
||||
if (!alg_rules) return (alg_rules);
|
||||
|
||||
|
||||
/* set all we can at this point */
|
||||
for (i=0;i<n_alg;i++) {
|
||||
alg_rules[i].alg_rule_id = i;
|
||||
@ -52,7 +52,7 @@ ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
|
||||
}
|
||||
|
||||
|
||||
ompi_coll_com_rule_t* ompi_coll_tuned_mk_com_rules (int n_com_rules, int alg_rule_id)
|
||||
ompi_coll_com_rule_t* ompi_coll_tuned_mk_com_rules (int n_com_rules, int alg_rule_id)
|
||||
{
|
||||
int i;
|
||||
ompi_coll_com_rule_t * com_rules;
|
||||
@ -95,9 +95,9 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul
|
||||
|
||||
|
||||
/*
|
||||
* Debug / IO routines
|
||||
* Debug / IO routines
|
||||
*
|
||||
*/
|
||||
*/
|
||||
int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
|
||||
{
|
||||
if (!msg_p) {
|
||||
@ -105,11 +105,11 @@ int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
|
||||
return (-1);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\tcom_id %3d\tcom_size %3d\tmsg_id %3d\t", msg_p->alg_rule_id,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\tcom_id %3d\tcom_size %3d\tmsg_id %3d\t", msg_p->alg_rule_id,
|
||||
msg_p->com_rule_id, msg_p->mpi_comsize, msg_p->msg_rule_id));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n",
|
||||
msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n",
|
||||
msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize,
|
||||
msg_p->result_max_requests));
|
||||
|
||||
return (0);
|
||||
@ -268,7 +268,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* query functions
|
||||
* i.e. the functions that get me the algorithm, topo fanin/out and segment size fast
|
||||
* and also get the rules that are needed by each communicator as needed
|
||||
@ -277,7 +277,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
|
||||
|
||||
/*
|
||||
* This function is used to get the pointer to the nearest (less than or equal)
|
||||
* com rule for this MPI collective (alg_id) for a given
|
||||
* com rule for this MPI collective (alg_id) for a given
|
||||
* MPI communicator size. The complete rule base must be presented.
|
||||
*
|
||||
* If no rule exits returns NULL, else the com rule ptr
|
||||
@ -302,7 +302,7 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
}
|
||||
|
||||
/* ok have some com sizes, now to find the one closest to my mpi_comsize */
|
||||
|
||||
|
||||
/* make a copy of the first com rule */
|
||||
best_com_p = com_p = alg_p->com_rules;
|
||||
i = best = 0;
|
||||
@ -324,13 +324,13 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
return (best_com_p);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function takes a com_rule ptr (from the communicators coll tuned data structure)
|
||||
/*
|
||||
* This function takes a com_rule ptr (from the communicators coll tuned data structure)
|
||||
* (Which is chosen for a particular MPI collective)
|
||||
* and a (total_)msg_size and it returns (0) and a algorithm to use and a recommended topo faninout and segment size
|
||||
* all based on the user supplied rules
|
||||
*
|
||||
* Just like the above functions it uses a less than or equal msg size
|
||||
* Just like the above functions it uses a less than or equal msg size
|
||||
* (hense config file must have a default defined for '0' if we reach this point)
|
||||
* else if no rules match we return '0' + '0,0' or used fixed decision table with no topo chand and no segmentation
|
||||
* of users data.. shame.
|
||||
@ -339,7 +339,7 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
*
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, size_t mpi_msgsize, int *result_topo_faninout,
|
||||
int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, size_t mpi_msgsize, int *result_topo_faninout,
|
||||
int* result_segsize, int* max_requests)
|
||||
{
|
||||
ompi_coll_msg_rule_t* msg_p = (ompi_coll_msg_rule_t*) NULL;
|
||||
@ -352,7 +352,7 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
|
||||
}
|
||||
|
||||
/* ok have some msg sizes, now to find the one closest to my mpi_msgsize */
|
||||
|
||||
|
||||
/* make a copy of the first msg rule */
|
||||
best_msg_p = msg_p = base_com_rule->msg_rules;
|
||||
i = best = 0;
|
||||
@ -387,6 +387,5 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
|
||||
*max_requests = best_msg_p->result_max_requests;
|
||||
|
||||
/* return the algorithm/method to use */
|
||||
return (best_msg_p->result_alg);
|
||||
return (best_msg_p->result_alg);
|
||||
}
|
||||
|
||||
|
198
ompi/mca/coll/tuned/coll_tuned_gather_decision.c
Обычный файл
198
ompi/mca/coll/tuned/coll_tuned_gather_decision.c
Обычный файл
@ -0,0 +1,198 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
/* gather algorithm variables */
|
||||
static int coll_tuned_gather_forced_algorithm = 0;
|
||||
static int coll_tuned_gather_segment_size = 0;
|
||||
static int coll_tuned_gather_tree_fanout;
|
||||
static int coll_tuned_gather_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_gather_forced_algorithm */
|
||||
static mca_base_var_enum_value_t gather_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "binomial"},
|
||||
{3, "linear_sync"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != gather_algorithms[cnt].string; cnt++ );
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[GATHER] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_count",
|
||||
"Number of gather algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_gather_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_gather_algorithms", gather_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm",
|
||||
"Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_gather_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_segment_size);
|
||||
|
||||
coll_tuned_gather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_tree_fanout);
|
||||
|
||||
coll_tuned_gather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"gather_algorithm_chain_fanout",
|
||||
"Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_gather_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_forced selected algorithm %d",
|
||||
tuned_module->user_forced[GATHER].algorithm));
|
||||
|
||||
switch (tuned_module->user_forced[GATHER].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (3):
|
||||
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
tuned_module->user_forced[GATHER].segsize);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[GATHER].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (3):
|
||||
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
@ -2,18 +2,18 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -26,13 +26,13 @@
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/base.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
#include "coll_tuned_dynamic_file.h"
|
||||
|
||||
static int tuned_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm);
|
||||
struct ompi_communicator_t *comm);
|
||||
/*
|
||||
* Initial query function that is invoked during MPI_INIT, allowing
|
||||
* this component to disqualify itself if it doesn't support the
|
||||
@ -79,8 +79,8 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
|
||||
*priority = ompi_coll_tuned_priority;
|
||||
|
||||
/*
|
||||
* Choose whether to use [intra|inter] decision functions
|
||||
/*
|
||||
* Choose whether to use [intra|inter] decision functions
|
||||
* and if using fixed OR dynamic rule sets.
|
||||
* Right now you cannot mix them, maybe later on it can be changed
|
||||
* but this would probably add an extra if and funct call to the path
|
||||
@ -114,9 +114,9 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
|
||||
/* We put all routines that handle the MCA user forced algorithm and parameter choices here */
|
||||
/* recheck the setting of forced, called on module create (i.e. for each new comm) */
|
||||
|
||||
|
||||
static int
|
||||
ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
|
||||
ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
|
||||
coll_tuned_force_algorithm_params_t *forced_values )
|
||||
{
|
||||
coll_tuned_force_algorithm_mca_param_indices_t* mca_params;
|
||||
@ -145,20 +145,20 @@ ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
#define COLL_TUNED_EXECUTE_IF_DYNAMIC(DATA, TYPE, EXECUTE) \
|
||||
#define COLL_TUNED_EXECUTE_IF_DYNAMIC(TMOD, TYPE, EXECUTE) \
|
||||
{ \
|
||||
int need_dynamic_decision = 0; \
|
||||
ompi_coll_tuned_forced_getvalues( (TYPE), &((DATA)->user_forced[(TYPE)]) ); \
|
||||
(DATA)->com_rules[(TYPE)] = NULL; \
|
||||
if( 0 != (DATA)->user_forced[(TYPE)].algorithm ) { \
|
||||
ompi_coll_tuned_forced_getvalues( (TYPE), &((TMOD)->user_forced[(TYPE)]) ); \
|
||||
(TMOD)->com_rules[(TYPE)] = NULL; \
|
||||
if( 0 != (TMOD)->user_forced[(TYPE)].algorithm ) { \
|
||||
need_dynamic_decision = 1; \
|
||||
EXECUTE; \
|
||||
} \
|
||||
if( NULL != mca_coll_tuned_component.all_base_rules ) { \
|
||||
(DATA)->com_rules[(TYPE)] \
|
||||
(TMOD)->com_rules[(TYPE)] \
|
||||
= ompi_coll_tuned_get_com_rule_ptr( mca_coll_tuned_component.all_base_rules, \
|
||||
(TYPE), size ); \
|
||||
if( NULL != (DATA)->com_rules[(TYPE)] ) { \
|
||||
if( NULL != (TMOD)->com_rules[(TYPE)] ) { \
|
||||
need_dynamic_decision = 1; \
|
||||
} \
|
||||
} \
|
||||
@ -178,7 +178,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
{
|
||||
int size;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module;
|
||||
mca_coll_tuned_comm_t *data = NULL;
|
||||
mca_coll_base_comm_t *data = NULL;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));
|
||||
|
||||
@ -191,32 +191,19 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
|
||||
/**
|
||||
* we still malloc data as it is used by the TUNED modules
|
||||
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
||||
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
||||
* we place any special info after the default data
|
||||
*
|
||||
* BUT on very large systems we might not be able to allocate all this memory so
|
||||
* we do check a MCA parameter to see if if we should allocate this memory
|
||||
*
|
||||
* The default is set very high
|
||||
*
|
||||
* The default is set very high
|
||||
*/
|
||||
|
||||
/* if we within the memory/size limit, allow preallocated data */
|
||||
if( size <= ompi_coll_tuned_preallocate_memory_comm_size_limit ) {
|
||||
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t) +
|
||||
(sizeof(ompi_request_t *) * size * 2));
|
||||
if (NULL == data) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) (data + 1);
|
||||
data->mcct_num_reqs = size * 2;
|
||||
} else {
|
||||
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t));
|
||||
if (NULL == data) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
data = OBJ_NEW(mca_coll_base_comm_t);
|
||||
if (NULL == data) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
@ -230,37 +217,37 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
* next dynamic state, recheck all forced rules as well
|
||||
* warning, we should check to make sure this is really an INTRA comm here...
|
||||
*/
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLGATHER,
|
||||
tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHERV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLGATHERV,
|
||||
tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLREDUCE,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLREDUCE,
|
||||
tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALL,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALL,
|
||||
tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALLV,
|
||||
tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLW,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALLW,
|
||||
tuned_module->super.coll_alltoallw = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BARRIER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, BARRIER,
|
||||
tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BCAST,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, BCAST,
|
||||
tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, EXSCAN,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, EXSCAN,
|
||||
tuned_module->super.coll_exscan = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, GATHER,
|
||||
tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHERV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, GATHERV,
|
||||
tuned_module->super.coll_gatherv = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCE,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCE,
|
||||
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCESCATTER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCESCATTER,
|
||||
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCAN,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCAN,
|
||||
tuned_module->super.coll_scan = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCATTER,
|
||||
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTERV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCATTERV,
|
||||
tuned_module->super.coll_scatterv = NULL);
|
||||
|
||||
if( false == ompi_coll_tuned_use_dynamic_rules ) {
|
||||
@ -269,7 +256,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
" decision by lack of dynamic rules"));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* general n fan out tree */
|
||||
data->cached_ntree = NULL;
|
||||
/* binary tree */
|
||||
@ -286,7 +273,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
data->cached_in_order_bintree = NULL;
|
||||
|
||||
/* All done */
|
||||
tuned_module->tuned_data = data;
|
||||
tuned_module->super.base_data = data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
|
||||
return OMPI_SUCCESS;
|
||||
|
222
ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
Обычный файл
222
ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
Обычный файл
@ -0,0 +1,222 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* reduce algorithm variables */
|
||||
static int coll_tuned_reduce_forced_algorithm = 0;
|
||||
static int coll_tuned_reduce_segment_size = 0;
|
||||
static int coll_tuned_reduce_max_requests;
|
||||
static int coll_tuned_reduce_tree_fanout;
|
||||
static int coll_tuned_reduce_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_reduce_forced_algorithm */
|
||||
static mca_base_var_enum_value_t reduce_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "linear"},
|
||||
{2, "chain"},
|
||||
{3, "pipeline"},
|
||||
{4, "binary"},
|
||||
{5, "binomial"},
|
||||
{6, "in-order_binary"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/**
|
||||
* The following are used by dynamic and forced rules
|
||||
*
|
||||
* publish details of each algorithm and if its forced/fixed/locked in
|
||||
* as you add methods/algorithms you must update this and the query/map routines
|
||||
*
|
||||
* this routine is called by the component only
|
||||
* this makes sure that the mca parameters are set to their initial values and
|
||||
* perms module does not call this they call the forced_getvalues routine
|
||||
* instead.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t*new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != reduce_algorithms[cnt].string; cnt++ );
|
||||
ompi_coll_tuned_forced_max_algorithms[REDUCE] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_count",
|
||||
"Number of reduce algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_reduce_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_reduce_algorithms", reduce_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm",
|
||||
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_reduce_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_segment_size);
|
||||
|
||||
coll_tuned_reduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_tree_fanout);
|
||||
|
||||
coll_tuned_reduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_chain_fanout);
|
||||
|
||||
coll_tuned_reduce_max_requests = 0; /* no limit for reduce by default */
|
||||
mca_param_indices->max_requests_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_max_requests",
|
||||
"Maximum number of outstanding send requests on leaf nodes. 0 means no limit.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_max_requests);
|
||||
if (mca_param_indices->max_requests_param_index < 0) {
|
||||
return mca_param_indices->max_requests_param_index;
|
||||
}
|
||||
|
||||
if (coll_tuned_reduce_max_requests < 0) {
|
||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||
opal_output( 0, "Maximum outstanding requests must be positive number or 0. Initializing to 0 (no limit).\n" );
|
||||
}
|
||||
coll_tuned_reduce_max_requests = 0;
|
||||
}
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op, int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
const int segsize = tuned_module->user_forced[REDUCE].segsize;
|
||||
const int chain_fanout = tuned_module->user_forced[REDUCE].chain_fanout;
|
||||
const int max_requests = tuned_module->user_forced[REDUCE].max_requests;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
|
||||
tuned_module->user_forced[REDUCE].algorithm));
|
||||
|
||||
|
||||
switch (tuned_module->user_forced[REDUCE].algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module);
|
||||
case (1): return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module);
|
||||
case (2): return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, chain_fanout, max_requests);
|
||||
case (3): return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
case (4): return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
case (5): return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
case (6): return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op, int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout,
|
||||
int segsize, int max_requests )
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module);
|
||||
case (1): return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module);
|
||||
case (2): return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, faninout, max_requests);
|
||||
case (3): return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
case (4): return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
case (5): return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
case (6): return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
|
||||
op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
173
ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
Обычный файл
173
ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
Обычный файл
@ -0,0 +1,173 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/op/op.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* reduce_scatter algorithm variables */
|
||||
static int coll_tuned_reduce_scatter_forced_algorithm = 0;
|
||||
static int coll_tuned_reduce_scatter_segment_size = 0;
|
||||
static int coll_tuned_reduce_scatter_tree_fanout;
|
||||
static int coll_tuned_reduce_scatter_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_reduce_scatter_forced_algorithm */
|
||||
static mca_base_var_enum_value_t reduce_scatter_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "non-overlapping"},
|
||||
{2, "recursive_halfing"},
|
||||
{3, "ring"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/**
|
||||
* The following are used by dynamic and forced rules
|
||||
*
|
||||
* publish details of each algorithm and if its forced/fixed/locked in
|
||||
* as you add methods/algorithms you must update this and the query/map routines
|
||||
*
|
||||
* this routine is called by the component only
|
||||
* this makes sure that the mca parameters are set to their initial values and
|
||||
* perms module does not call this they call the forced_getvalues routine
|
||||
* instead
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != reduce_scatter_algorithms[cnt].string; cnt++ );
|
||||
ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_count",
|
||||
"Number of reduce_scatter algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_reduce_scatter_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_reduce_scatter_algorithms", reduce_scatter_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm",
|
||||
"Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_reduce_scatter_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_segment_size);
|
||||
|
||||
coll_tuned_reduce_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_tree_fanout);
|
||||
|
||||
coll_tuned_reduce_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_scatter_algorithm_chain_fanout",
|
||||
"Fanout for chains used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_reduce_scatter_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d",
|
||||
tuned_module->user_forced[REDUCESCATTER].algorithm));
|
||||
|
||||
switch (tuned_module->user_forced[REDUCESCATTER].algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
||||
dtype, op, comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
||||
return (MPI_ERR_ARG);
|
||||
}
|
@ -1,421 +0,0 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
/* scatter algorithm variables */
|
||||
static int coll_tuned_scatter_algorithm_count = 2;
|
||||
static int coll_tuned_scatter_forced_algorithm = 0;
|
||||
static int coll_tuned_scatter_segment_size = 0;
|
||||
static int coll_tuned_scatter_tree_fanout;
|
||||
static int coll_tuned_scatter_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_scatter_forced_algorithm */
|
||||
static mca_base_var_enum_value_t scatter_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "binomial"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/*
 * Binomial-tree scatter.
 *
 * The root pushes contiguous runs of per-rank blocks down an in-order
 * binomial tree: interior nodes receive a run of blocks from their parent,
 * keep the first block for themselves, and forward sub-runs to each child;
 * leaves receive exactly their own block.  Because blocks must stay
 * contiguous per subtree, a non-zero root first rotates the send buffer so
 * that the root's block comes first in virtual-rank order.
 *
 * Accepts the same arguments as MPI_Scatter(); returns MPI_SUCCESS or an
 * error code (logging file/line on failure).
 */
int
ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
                                       struct ompi_datatype_t *sdtype,
                                       void *rbuf, int rcount,
                                       struct ompi_datatype_t *rdtype,
                                       int root,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    int line = -1, i, rank, vrank, size, total_send = 0, err;
    char *ptmp, *tempbuf = NULL;          /* tempbuf: scratch allocation, freed on all paths */
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, slb, strue_lb, strue_extent;
    MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
    mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    mca_coll_tuned_comm_t *data = tuned_module->tuned_data;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);

    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "ompi_coll_tuned_scatter_intra_binomial rank %d", rank));

    /* create (or reuse the cached) in-order binomial tree rooted at root */
    COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
    bmtree = data->cached_in_order_bmtree;

    /* Extents are needed for block addressing; true extents size tempbuf
     * correctly for datatypes with non-trivial lower bounds. */
    ompi_datatype_get_extent(sdtype, &slb, &sextent);
    ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent);
    ompi_datatype_get_extent(rdtype, &rlb, &rextent);
    ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);

    /* vrank: rank shifted so the root is virtual rank 0 */
    vrank = (rank - root + size) % size;
    ptmp = (char *) rbuf;  /* by default suppose leaf nodes, just use rbuf */

    if (rank == root) {
        if (0 == root) {
            /* root on 0, just use the send buffer */
            ptmp = (char *) sbuf;
            if (rbuf != MPI_IN_PLACE) {
                /* local copy to rbuf */
                err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
                                           rbuf, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        } else {
            /* root is not on 0, allocate temp buffer for send */
            tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
            if (NULL == tempbuf) {
                err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
            }

            /* offset so ptmp addresses the datatype's true lower bound */
            ptmp = tempbuf - strue_lb;

            /* and rotate data so they will eventually in the right place:
             * blocks [root..size-1] first, then blocks [0..root-1], i.e.
             * virtual-rank order with the root's own block at offset 0 */
            err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root),
                                                      ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }


            err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root,
                                                      ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            if (rbuf != MPI_IN_PLACE) {
                /* local copy to rbuf */
                err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
                                           rbuf, rcount, rdtype);
                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            }
        }
        total_send = scount;
    } else if (!(vrank % 2)) {
        /* non-root, non-leaf nodes, allocte temp buffer for recv
         * the most we need is rcount*size/2 */
        tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
        if (NULL == tempbuf) {
            err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }

        ptmp = tempbuf - rtrue_lb;

        /* interior nodes forward what they received, so from here on the
         * "send" side uses the receive datatype/count */
        sdtype = rdtype;
        scount = rcount;
        sextent = rextent;
        total_send = scount;
    }

    if (!(vrank % 2)) {
        /* even virtual ranks are interior nodes (or the root) */
        if (rank != root) {
            /* recv from parent on non-root; count is an upper bound, the
             * actual amount is whatever the parent sent */
            err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev,
                                    MCA_COLL_BASE_TAG_SCATTER, comm, &status));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
            /* local copy to rbuf */
            err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
                                       rbuf, rcount, rdtype);
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        }
        /* send to children on all non-leaf */
        for (i = 0; i < bmtree->tree_nextsize; i++) {
            size_t mycount = 0;
            int vkid;
            /* figure out how much data I have to send to this child:
             * the child's whole subtree, clipped to the end of the ring */
            vkid = (bmtree->tree_next[i] - root + size) % size;
            mycount = vkid - vrank;
            if( (int)mycount > (size - vkid) )
                mycount = size - vkid;
            mycount *= scount;

            err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype,
                                    bmtree->tree_next[i],
                                    MCA_COLL_BASE_TAG_SCATTER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

            /* total_send tracks the offset of the next child's run */
            total_send += mycount;
        }

        if (NULL != tempbuf)
            free(tempbuf);
    } else {
        /* recv from parent on leaf nodes: exactly one block, straight
         * into rbuf (ptmp still points there for leaves) */
        err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev,
                                MCA_COLL_BASE_TAG_SCATTER, comm, &status));
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
        free(tempbuf);

    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    return err;
}
|
||||
|
||||
/*
|
||||
* Linear functions are copied from the BASIC coll module
|
||||
* they do not segment the message and are simple implementations
|
||||
* but for some small number of nodes and/or small data sizes they
|
||||
* are just as fast as tuned/tree based segmenting operations
|
||||
* and as such may be selected by the decision functions
|
||||
* These are copied into this module due to the way we select modules
|
||||
* in V1. i.e. in V2 we will handle this differently and so will not
|
||||
* have to duplicate code.
|
||||
* JPG following the examples from other coll_tuned implementations. Dec06.
|
||||
*/
|
||||
|
||||
/* copied function (with appropriate renaming) starts here */
|
||||
/*
|
||||
* scatter_intra
|
||||
*
|
||||
* Function: - basic scatter operation
|
||||
* Accepts: - same arguments as MPI_Scatter()
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void *rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i, rank, size, err;
|
||||
ptrdiff_t lb, incr;
|
||||
char *ptmp;
|
||||
|
||||
/* Initialize */
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
size = ompi_comm_size(comm);
|
||||
|
||||
/* If not root, receive data. */
|
||||
|
||||
if (rank != root) {
|
||||
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
|
||||
MCA_COLL_BASE_TAG_SCATTER,
|
||||
comm, MPI_STATUS_IGNORE));
|
||||
return err;
|
||||
}
|
||||
|
||||
/* I am the root, loop sending data. */
|
||||
|
||||
err = ompi_datatype_get_extent(sdtype, &lb, &incr);
|
||||
if (OMPI_SUCCESS != err) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
incr *= scount;
|
||||
for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
|
||||
|
||||
/* simple optimization */
|
||||
|
||||
if (i == rank) {
|
||||
if (MPI_IN_PLACE != rbuf) {
|
||||
err =
|
||||
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
|
||||
rdtype);
|
||||
}
|
||||
} else {
|
||||
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
|
||||
MCA_COLL_BASE_TAG_SCATTER,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
}
|
||||
if (MPI_SUCCESS != err) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* copied function (with appropriate renaming) ends here */
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
/*
 * Register all MCA parameters that control forced scatter algorithm
 * selection (called once by the component, not by modules; modules read
 * the values back via the forced_getvalues routine).
 *
 * On success fills in mca_param_indices with the registered variable
 * indices and returns MPI_SUCCESS; returns the (negative) registration
 * error if the algorithm parameter itself cannot be registered.
 */
int
ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;

    /* publish the number of available algorithms for this collective */
    ompi_coll_tuned_forced_max_algorithms[SCATTER] = coll_tuned_scatter_algorithm_count;

    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                           "scatter_algorithm_count",
                                           "Number of scatter algorithms available",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                           OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &coll_tuned_scatter_algorithm_count);

    /* MPI_T: This variable should eventually be bound to a communicator */
    coll_tuned_scatter_forced_algorithm = 0;
    (void) mca_base_var_enum_create("coll_tuned_scatter_algorithms", scatter_algorithms, &new_enum);
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "scatter_algorithm",
                                        "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_scatter_forced_algorithm);
    OBJ_RELEASE(new_enum);
    /* only the algorithm parameter is treated as fatal on failure */
    if (mca_param_indices->algorithm_param_index < 0) {
        return mca_param_indices->algorithm_param_index;
    }

    /* segment size: registered for interface uniformity — current scatter
     * algorithms do not segment */
    coll_tuned_scatter_segment_size = 0;
    mca_param_indices->segsize_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "scatter_algorithm_segmentsize",
                                        "Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_scatter_segment_size);

    coll_tuned_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
    mca_param_indices->tree_fanout_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "scatter_algorithm_tree_fanout",
                                        "Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_scatter_tree_fanout);

    coll_tuned_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
    mca_param_indices->chain_fanout_param_index=
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "scatter_algorithm_chain_fanout",
                                        "Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
                                        MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_READONLY,
                                        &coll_tuned_scatter_chain_fanout);

    return (MPI_SUCCESS);
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_forced selected algorithm %d",
|
||||
data->user_forced[SCATTER].algorithm));
|
||||
|
||||
switch (data->user_forced[SCATTER].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
data->user_forced[SCATTER].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
}
|
185
ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
Обычный файл
185
ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
Обычный файл
@ -0,0 +1,185 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "mpi.h"
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/datatype/ompi_datatype.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* scatter algorithm variables */
|
||||
static int coll_tuned_scatter_forced_algorithm = 0;
|
||||
static int coll_tuned_scatter_segment_size = 0;
|
||||
static int coll_tuned_scatter_tree_fanout;
|
||||
static int coll_tuned_scatter_chain_fanout;
|
||||
|
||||
/* valid values for coll_tuned_scatter_forced_algorithm */
|
||||
static mca_base_var_enum_value_t scatter_algorithms[] = {
|
||||
{0, "ignore"},
|
||||
{1, "basic_linear"},
|
||||
{2, "binomial"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
/* The following are used by dynamic and forced rules */
|
||||
|
||||
/* publish details of each algorithm and if its forced/fixed/locked in */
|
||||
/* as you add methods/algorithms you must update this and the query/map
|
||||
routines */
|
||||
|
||||
/* this routine is called by the component only */
|
||||
/* this makes sure that the mca parameters are set to their initial values
|
||||
and perms */
|
||||
/* module does not call this they call the forced_getvalues routine instead */
|
||||
|
||||
int
|
||||
ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
|
||||
{
|
||||
mca_base_var_enum_t *new_enum;
|
||||
int cnt;
|
||||
|
||||
for( cnt = 0; NULL != scatter_algorithms[cnt].string; cnt++ );
|
||||
ompi_coll_tuned_forced_max_algorithms[SCATTER] = cnt;
|
||||
|
||||
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"scatter_algorithm_count",
|
||||
"Number of scatter algorithms available",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0,
|
||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
||||
&cnt);
|
||||
|
||||
/* MPI_T: This variable should eventually be bound to a communicator */
|
||||
coll_tuned_scatter_forced_algorithm = 0;
|
||||
(void) mca_base_var_enum_create("coll_tuned_scatter_algorithms", scatter_algorithms, &new_enum);
|
||||
mca_param_indices->algorithm_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"scatter_algorithm",
|
||||
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
|
||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_scatter_forced_algorithm);
|
||||
OBJ_RELEASE(new_enum);
|
||||
if (mca_param_indices->algorithm_param_index < 0) {
|
||||
return mca_param_indices->algorithm_param_index;
|
||||
}
|
||||
|
||||
coll_tuned_scatter_segment_size = 0;
|
||||
mca_param_indices->segsize_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"scatter_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_scatter_segment_size);
|
||||
|
||||
coll_tuned_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
|
||||
mca_param_indices->tree_fanout_param_index =
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"scatter_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_scatter_tree_fanout);
|
||||
|
||||
coll_tuned_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
|
||||
mca_param_indices->chain_fanout_param_index=
|
||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||
"scatter_algorithm_chain_fanout",
|
||||
"Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_5,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&coll_tuned_scatter_chain_fanout);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_forced selected algorithm %d",
|
||||
tuned_module->user_forced[SCATTER].algorithm));
|
||||
|
||||
switch (tuned_module->user_forced[SCATTER].algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
tuned_module->user_forced[SCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
||||
return MPI_ERR_ARG;
|
||||
}
|
||||
|
||||
int
|
||||
ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
case (0):
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (1):
|
||||
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
case (2):
|
||||
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
} /* switch */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
||||
return MPI_ERR_ARG;
|
||||
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user