
Merge pull request #423 from ICLDisco/tuned

Dismantle the Tuned collective
This commit is contained in:
Howard Pritchard 2015-02-26 16:19:43 -07:00
parent 5215dc0db3 ced44e12da
commit cf56c6a9f2
51 changed files with 4420 additions and 5126 deletions

View File

@@ -2,7 +2,7 @@
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# Copyright (c) 2004-2015 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -20,10 +20,26 @@ dist_ompidata_DATA = base/help-mca-coll-base.txt
headers += \
base/base.h \
base/coll_tags.h
base/coll_tags.h \
base/coll_base_topo.h \
base/coll_base_util.h \
base/coll_base_functions.h
libmca_coll_la_SOURCES += \
base/coll_base_comm_select.c \
base/coll_base_comm_unselect.c \
base/coll_base_find_available.c \
base/coll_base_frame.c
base/coll_base_frame.c \
base/coll_base_bcast.c \
base/coll_base_scatter.c \
base/coll_base_topo.c \
base/coll_base_allgather.c \
base/coll_base_allgatherv.c \
base/coll_base_util.c \
base/coll_base_allreduce.c \
base/coll_base_alltoall.c \
base/coll_base_gather.c \
base/coll_base_alltoallv.c \
base/coll_base_reduce.c \
base/coll_base_barrier.c \
base/coll_base_reduce_scatter.c

View File

@@ -87,7 +87,7 @@ int mca_coll_base_find_available(bool enable_progress_threads,
* coll component needs to be selected for it. It should be invoked
* near the end of the communicator creation process such that
* almost everything else is functional on the communicator (e.g.,
* point-to-point communication).
* point-to-point communication).
*
* Note that new communicators may be created as a result of
* invoking this function. Specifically: this function is called in

View File

@@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -30,31 +30,12 @@
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* allgather algorithm variables */
static int coll_tuned_allgather_algorithm_count = 6;
static int coll_tuned_allgather_forced_algorithm = 0;
static int coll_tuned_allgather_segment_size = 0;
static int coll_tuned_allgather_tree_fanout;
static int coll_tuned_allgather_chain_fanout;
/* valid values for coll_tuned_allgather_forced_algorithm */
static mca_base_var_enum_value_t allgather_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "bruck"},
{3, "recursive_doubling"},
{4, "ring"},
{5, "neighbor"},
{6, "two_proc"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/*
* ompi_coll_tuned_allgather_intra_bruck
* ompi_coll_base_allgather_intra_bruck
*
* Function: allgather using O(log(N)) steps.
* Accepts: Same arguments as MPI_Allgather
@@ -65,7 +46,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
* in Multiport Message-Passing Systems"
* Memory requirements: non-zero ranks require shift buffer to perform final
* step in the algorithm.
*
*
* Example on 6 nodes:
* Initialization: everyone has its own buffer at location 0 in rbuf
* This means if user specified MPI_IN_PLACE for sendbuf
@@ -84,7 +65,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
* [2] [3] [4] [5] [0] [1]
* [3] [4] [5] [0] [1] [2]
* Step 2: send message to (rank - 2^2), receive message from (rank + 2^2)
* message size is "all remaining blocks"
* message size is "all remaining blocks"
* # 0 1 2 3 4 5
* [0] [1] [2] [3] [4] [5]
* [1] [2] [3] [4] [5] [0]
@@ -101,7 +82,7 @@ static mca_base_var_enum_value_t allgather_algorithms[] = {
* [4] [4] [4] [4] [4] [4]
* [5] [5] [5] [5] [5] [5]
*/
int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
int ompi_coll_base_allgather_intra_bruck(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@@ -115,8 +96,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_bruck rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -125,7 +106,7 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Initialization step:
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
receive buffer, else
- if rank r != 0, copy r^th block from receive buffer to block 0.
*/
@@ -140,15 +121,15 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend);
if (err < 0) { line = __LINE__; goto err_hndl; }
}
/* Communication step:
At every step i, rank r:
- doubles the distance
- sends message which starts at begining of rbuf and has size
- sends message which starts at begining of rbuf and has size
(blockcount * rcount) to rank (r - distance)
- receives message of size blockcount * rcount from rank (r + distance)
at location (rbuf + distance * rcount * rext)
- blockcount doubles until last step when only the remaining data is
- blockcount doubles until last step when only the remaining data is
exchanged.
*/
blockcount = 1;
@@ -162,14 +143,14 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
if (distance <= (size >> 1)) {
blockcount = distance;
} else {
} else {
blockcount = size - distance;
}
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, blockcount * rcount, rdtype,
err = ompi_coll_base_sendrecv(tmpsend, blockcount * rcount, rdtype,
sendto, MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, blockcount * rcount, rdtype,
tmprecv, blockcount * rcount, rdtype,
recvfrom, MCA_COLL_BASE_TAG_ALLGATHER,
comm, MPI_STATUS_IGNORE, rank);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -178,8 +159,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
/* Finalization step:
On all nodes except 0, data needs to be shifted locally:
- create temporary shift buffer,
see discussion in coll_basic_reduce.c about the size and begining
- create temporary shift buffer,
see discussion in coll_basic_reduce.c about the size and begining
of temporary buffer.
- copy blocks [0 .. (size - rank - 1)] from rbuf to shift buffer
- move blocks [(size - rank) .. size] from rbuf to begining of rbuf
@@ -195,8 +176,8 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
free_buf = (char*) calloc(((true_extent +
((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)),
sizeof(char));
if (NULL == free_buf) {
line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
if (NULL == free_buf) {
line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl;
}
shift_buf = free_buf - true_lb;
@@ -207,13 +188,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
/* 2. move blocks [(size - rank) .. size] from rbuf to the begining of rbuf */
tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext;
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount,
rbuf, tmpsend);
if (err < 0) { line = __LINE__; goto err_hndl; }
/* 3. copy blocks from shift buffer back to rbuf starting at block [rank]. */
tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount,
tmprecv, shift_buf);
if (err < 0) { line = __LINE__; goto err_hndl; }
@@ -223,13 +204,13 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
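
For illustration, a minimal standalone sketch (C99; hypothetical helper name, not part of this commit) of the exchange schedule the Bruck allgather above implements: at each step the distance doubles, and each rank sends its first "blockcount" blocks to (rank - distance) while receiving the matching blocks from (rank + distance), with wrap-around:

#include <stdio.h>

/* Print the per-step peers and block counts of the Bruck allgather. */
static void bruck_allgather_schedule(int rank, int size)
{
    for (int distance = 1; distance < size; distance <<= 1) {
        int sendto   = (rank - distance + size) % size;
        int recvfrom = (rank + distance) % size;
        /* every step but the last moves "distance" blocks; the last
           step moves only the blocks that are still missing */
        int blockcount = (distance <= (size >> 1)) ? distance
                                                   : size - distance;
        printf("rank %d, distance %d: send %d block(s) to %d, recv from %d\n",
               rank, distance, blockcount, sendto, recvfrom);
    }
}

int main(void)
{
    bruck_allgather_schedule(2, 6);   /* mirrors the 6-node example above */
    return 0;
}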
/*
* ompi_coll_tuned_allgather_intra_recursivedoubling
* ompi_coll_base_allgather_intra_recursivedoubling
*
* Function: allgather using O(log(N)) steps.
* Accepts: Same arguments as MPI_Allgather
@@ -239,29 +220,29 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
* This algorithm is used in MPICH-2 for small- and medium-sized
* messages on power-of-two processes.
*
* Limitation: Current implementation only works on power-of-two number of
* processes.
* Limitation: Current implementation only works on power-of-two number of
* processes.
* In case this algorithm is invoked on non-power-of-two
* processes, Bruck algorithm will be invoked.
*
*
* Memory requirements:
* No additional memory requirements beyond user-supplied buffers.
*
*
* Example on 4 nodes:
* Initialization: everyone has its own buffer at location rank in rbuf
* # 0 1 2 3
* # 0 1 2 3
* [0] [ ] [ ] [ ]
* [ ] [1] [ ] [ ]
* [ ] [ ] [2] [ ]
* [ ] [ ] [ ] [3]
* Step 0: exchange data with (rank ^ 2^0)
* # 0 1 2 3
* # 0 1 2 3
* [0] [0] [ ] [ ]
* [1] [1] [ ] [ ]
* [ ] [ ] [2] [2]
* [ ] [ ] [3] [3]
* Step 1: exchange data with (rank ^ 2^1) (if you can)
* # 0 1 2 3
* # 0 1 2 3
* [0] [0] [0] [0]
* [1] [1] [1] [1]
* [2] [2] [2] [2]
@@ -269,12 +250,12 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount,
*
* TODO: Modify the algorithm to work with any number of nodes.
* We can modify code to use identical implementation like MPICH-2:
* - using recursive-halving algorithm, at the end of each step,
* - using recursive-halving algorithm, at the end of each step,
* determine if there are nodes who did not exchange their data in that
* step, and send them appropriate messages.
*/
int
ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
int
ompi_coll_base_allgather_intra_recursivedoubling(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@@ -293,21 +274,21 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
pow2size >>=1;
/* Current implementation only handles power-of-two number of processes.
If the function was called on non-power-of-two number of processes,
If the function was called on non-power-of-two number of processes,
print warning and call bruck allgather algorithm with same parameters.
*/
if (pow2size != size) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm",
size));
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_recursivedoubling rank %d, size %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_recursivedoubling rank %d, size %d",
rank, size));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
@@ -317,7 +298,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Initialization step:
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
- if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of
receive buffer
*/
if (MPI_IN_PLACE != sbuf) {
@@ -326,8 +307,8 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Communication step:
At every step i, rank r:
- exchanges message with rank remote = (r ^ 2^i).
@@ -347,7 +328,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
}
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
remote, MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype,
remote, MCA_COLL_BASE_TAG_ALLGATHER,
@@ -359,7 +340,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
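
A similar sketch (hypothetical names) of the recursive-doubling pairing: at step i, rank r exchanges the 2^i blocks it already owns with partner r ^ 2^i, so the owned range doubles each step, which is also why the algorithm requires a power-of-two process count:

#include <stdio.h>

/* Print the per-step partner and owned block range (power-of-two sizes). */
static void recursive_doubling_schedule(int rank, int size)
{
    for (int distance = 1; distance < size; distance <<= 1) {
        int remote = rank ^ distance;              /* flip one address bit */
        int owned_start = (rank / distance) * distance;
        printf("rank %d: exchange blocks [%d..%d) with rank %d\n",
               rank, owned_start, owned_start + distance, remote);
    }
}

int main(void)
{
    for (int r = 0; r < 4; r++)
        recursive_doubling_schedule(r, 4);   /* the 4-node example above */
    return 0;
}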
@@ -367,7 +348,7 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
/*
* ompi_coll_tuned_allgather_intra_ring
* ompi_coll_base_allgather_intra_ring
*
* Function: allgather using O(N) steps.
* Accepts: Same arguments as MPI_Allgather
@@ -379,9 +360,9 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount,
* (r + 1) containing data from rank (r - i), with wrap arounds.
* Memory requirements:
* No additional memory requirements.
*
*
*/
int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
int ompi_coll_base_allgather_intra_ring(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@@ -395,8 +376,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_ring rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_ring rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -413,15 +394,15 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
tmpsend = (char*) sbuf;
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Communication step:
At every step i: 0 .. (P-1), rank r:
- receives message from [(r - 1 + size) % size] containing data from rank
[(r - i - 1 + size) % size]
- sends message to rank [(r + 1) % size] containing data from rank
[(r - i + size) % size]
- sends message which starts at begining of rbuf and has size
- sends message which starts at begining of rbuf and has size
*/
sendto = (rank + 1) % size;
recvfrom = (rank - 1 + size) % size;
@@ -434,7 +415,7 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, sendto,
err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, sendto,
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, rcount, rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLGATHER,
@@ -446,34 +427,34 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
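
The ring schedule reduces to pure index arithmetic; a hypothetical sketch of the indices the loop above computes (each step forwards the block received in the previous step):

#include <stdio.h>

/* Print which block each rank sends/receives at every ring step. */
static void ring_allgather_schedule(int rank, int size)
{
    int sendto   = (rank + 1) % size;
    int recvfrom = (rank - 1 + size) % size;
    for (int i = 0; i < size - 1; i++) {
        int recvdatafrom = (rank - i - 1 + size) % size;
        int senddatafrom = (rank - i + size) % size;
        printf("rank %d step %d: send block %d to %d, recv block %d from %d\n",
               rank, i, senddatafrom, sendto, recvdatafrom, recvfrom);
    }
}

int main(void)
{
    ring_allgather_schedule(0, 4);
    return 0;
}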
/*
* ompi_coll_tuned_allgather_intra_neighborexchange
* ompi_coll_base_allgather_intra_neighborexchange
*
* Function: allgather using N/2 steps (O(N))
* Accepts: Same arguments as MPI_Allgather
* Returns: MPI_SUCCESS or error code
*
* Description: Neighbor Exchange algorithm for allgather.
* Described by Chen et.al. in
* "Performance Evaluation of Allgather Algorithms on
* Described by Chen et.al. in
* "Performance Evaluation of Allgather Algorithms on
* Terascale Linux Cluster with Fast Ethernet",
* Proceedings of the Eighth International Conference on
* Proceedings of the Eighth International Conference on
* High-Performance Computing inn Asia-Pacific Region
* (HPCASIA'05), 2005
*
*
* Rank r exchanges message with one of its neighbors and
* forwards the data further in the next step.
*
* No additional memory requirements.
*
*
* Limitations: Algorithm works only on even number of processes.
* For odd number of processes we switch to ring algorithm.
*
*
* Example on 6 nodes:
* Initial state
* # 0 1 2 3 4 5
@@ -508,8 +489,8 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount,
* [4] [4] [4] [4] [4] [4]
* [5] [5] [5] [5] [5] [5]
*/
int
ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
int
ompi_coll_base_allgather_intra_neighborexchange(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@@ -525,16 +506,16 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
if (size % 2) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
size));
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_neighborexchange rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_neighborexchange rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -551,7 +532,7 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
tmpsend = (char*) sbuf;
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Determine neighbors, order in which blocks will arrive, etc. */
even_rank = !(rank % 2);
@@ -573,15 +554,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
/* Communication loop:
- First step is special: exchange a single block with neighbor[0].
- Rest of the steps:
update recv_data_from according to offset, and
- Rest of the steps:
update recv_data_from according to offset, and
exchange two blocks with appropriate neighbor.
the send location becomes previous receve location.
*/
tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext;
tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
err = ompi_coll_base_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, rcount, rdtype, neighbor[0],
MCA_COLL_BASE_TAG_ALLGATHER,
@@ -597,15 +578,15 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
for (i = 1; i < (size / 2); i++) {
const int i_parity = i % 2;
recv_data_from[i_parity] =
recv_data_from[i_parity] =
(recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext;
tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * rcount * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
neighbor[i_parity],
err = ompi_coll_base_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
neighbor[i_parity],
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype,
neighbor[i_parity],
@@ -619,13 +600,13 @@ ompi_coll_tuned_allgather_intra_neighborexchange(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
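
A sketch of the neighbor-exchange pattern (hypothetical helper; the even/odd initialization mirrors what the function above sets up): even ranks pair first with (rank + 1), odd ranks with (rank - 1), then the partner alternates and every later step forwards two blocks:

#include <stdio.h>

/* Print the partners and received block pairs for one rank. */
static void neighbor_exchange_schedule(int rank, int size)
{
    int even = !(rank % 2);
    int neighbor[2], offset_at_step[2], recv_data_from[2];

    if (even) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        offset_at_step[0] = +2;  offset_at_step[1] = -2;
        recv_data_from[0] = recv_data_from[1] = rank;
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        offset_at_step[0] = -2;  offset_at_step[1] = +2;
        recv_data_from[0] = recv_data_from[1] = neighbor[0];
    }
    printf("rank %d step 0: exchange 1 block with %d\n", rank, neighbor[0]);
    for (int i = 1; i < size / 2; i++) {
        int p = i % 2;
        recv_data_from[p] =
            (recv_data_from[p] + offset_at_step[p] + size) % size;
        printf("rank %d step %d: exchange blocks %d,%d with %d\n",
               rank, i, recv_data_from[p], (recv_data_from[p] + 1) % size,
               neighbor[p]);
    }
}

int main(void)
{
    for (int r = 0; r < 6; r++)
        neighbor_exchange_schedule(r, 6);   /* the 6-node example above */
    return 0;
}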
int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
int ompi_coll_base_allgather_intra_two_procs(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@@ -638,8 +619,8 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgather_intra_two_procs rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_allgather_intra_two_procs rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -661,7 +642,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
}
tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext;
err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote,
err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHER,
tmprecv, rcount, rdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHER,
@@ -670,7 +651,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
/* Place your data in correct location if necessary */
if (MPI_IN_PLACE != sbuf) {
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
@@ -678,7 +659,7 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@@ -687,13 +668,13 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
@@ -706,10 +687,10 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
ompi_coll_base_allgather_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf,
int rcount,
int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
@@ -727,7 +708,7 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount);
sdtype = rdtype;
scount = rcount;
}
}
/* Gather and broadcast. */
@@ -755,183 +736,3 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount,
}
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* module does not call this they call the forced_getvalues routine instead */
int
ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = coll_tuned_allgather_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_count",
"Number of allgather algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_allgather_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allgather_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allgather_algorithms", allgather_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm",
"Which allallgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allgather_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_segmentsize",
"Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_segment_size);
coll_tuned_allgather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_tree_fanout",
"Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_tree_fanout);
coll_tuned_allgather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_chain_fanout",
"Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_chain_fanout);
return (MPI_SUCCESS);
}
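
Once registered, these variables surface as ordinary run-time MCA options of the tuned component; forcing, say, the ring algorithm should amount to something like "mpirun --mca coll_tuned_allgather_algorithm 4 ..." (assuming the tuned component is selected and its forced-algorithm path is enabled).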
int ompi_coll_tuned_allgather_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_forced selected algorithm %d",
data->user_forced[ALLGATHER].algorithm));
switch (data->user_forced[ALLGATHER].algorithm) {
case (0):
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgather_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgather_intra_bruck (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgather_intra_recursivedoubling (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgather_intra_ring (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgather_intra_neighborexchange (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (6):
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLGATHER].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_allgather_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgather_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (6):
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
return (MPI_ERR_ARG);
} /* switch */
}

View File

@@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -30,19 +30,12 @@
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/* allgatherv algorithm variables */
static int coll_tuned_allgatherv_algorithm_count = 5;
static int coll_tuned_allgatherv_forced_algorithm = 0;
static int coll_tuned_allgatherv_segment_size = 0;
static int coll_tuned_allgatherv_tree_fanout;
static int coll_tuned_allgatherv_chain_fanout;
/* valid values for coll_tuned_allgatherv_forced_algorithm */
static mca_base_var_enum_value_t allgatherv_algorithms[] = {
/* valid values for coll_base_allgatherv_forced_algorithm */
mca_base_var_enum_value_t coll_base_allgatherv_algorithms[] = {
{0, "ignore"},
{1, "default"},
{2, "bruck"},
@@ -53,7 +46,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
};
/*
* ompi_coll_tuned_allgatherv_intra_bruck
* ompi_coll_base_allgatherv_intra_bruck
*
* Function: allgather using O(log(N)) steps.
* Accepts: Same arguments as MPI_Allgather
@@ -64,7 +57,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
* in Multiport Message-Passing Systems"
* Note: Unlike in case of allgather implementation, we relay on
* indexed datatype to select buffers appropriately.
* The only additional memory requirement is for creation of
* The only additional memory requirement is for creation of
* temporary datatypes.
* Example on 7 nodes (memory lay out need not be in-order)
* Initial set up:
@@ -86,7 +79,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
* [ ] [ ] [ ] [ ] [5] [5] [ ]
* [ ] [ ] [ ] [ ] [ ] [6] [6]
* Step 1: send message to (rank - 2^1), receive message from (rank + 2^1).
* message contains all blocks from (rank) .. (rank + 2^2) with
* message contains all blocks from (rank) .. (rank + 2^2) with
* wrap around.
* # 0 1 2 3 4 5 6
* [0] [ ] [ ] [ ] [0] [0] [0]
@@ -97,7 +90,7 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
* [ ] [ ] [5] [5] [5] [5] [ ]
* [ ] [ ] [ ] [6] [6] [6] [6]
* Step 2: send message to (rank - 2^2), receive message from (rank + 2^2).
* message size is "all remaining blocks"
* message size is "all remaining blocks"
* # 0 1 2 3 4 5 6
* [0] [0] [0] [0] [0] [0] [0]
* [1] [1] [1] [1] [1] [1] [1]
@@ -107,10 +100,10 @@ static mca_base_var_enum_value_t allgatherv_algorithms[] = {
* [5] [5] [5] [5] [5] [5] [5]
* [6] [6] [6] [6] [6] [6] [6]
*/
int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
int ompi_coll_base_allgatherv_intra_bruck(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
@@ -124,9 +117,9 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgather_intra_bruck rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -134,27 +127,27 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Initialization step:
- if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
- if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
the receive buffer.
*/
tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
if (MPI_IN_PLACE != sbuf) {
tmpsend = (char*) sbuf;
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
tmprecv, rcounts[rank], rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
/* Communication step:
At every step i, rank r:
- doubles the distance
- sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
to rank (r - distance)
- receives message of blockcount blocks,
(rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
- receives message of blockcount blocks,
(rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
rank (r + distance)
- blockcount doubles until the last step when only the remaining data is
- blockcount doubles until the last step when only the remaining data is
exchanged.
*/
blockcount = 1;
@@ -173,7 +166,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
if (distance <= (size >> 1)) {
blockcount = distance;
} else {
} else {
blockcount = size - distance;
}
@@ -186,7 +179,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
new_rcounts[i] = rcounts[tmp_rrank];
new_rdispls[i] = rdispls[tmp_rrank];
}
err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls,
err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls,
rdtype, &new_sdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
err = ompi_datatype_create_indexed(blockcount, new_rcounts, new_rdispls,
@@ -198,7 +191,7 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto,
err = ompi_coll_base_sendrecv(rbuf, 1, new_sdtype, sendto,
MCA_COLL_BASE_TAG_ALLGATHERV,
rbuf, 1, new_rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLGATHERV,
@@ -207,7 +200,6 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
ompi_datatype_destroy(&new_sdtype);
ompi_datatype_destroy(&new_rdtype);
}
free(new_rcounts);
@@ -217,14 +209,14 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
err_hndl:
if( NULL != new_rcounts ) free(new_rcounts);
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
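
Because counts and displacements vary per rank here, each Bruck step above packs its blocks with an indexed datatype; a hypothetical sketch of the block lists one step would describe with the caller's rcounts[]/rdispls[]:

#include <stdio.h>

/* For one step at "distance", list the blocks forwarded and received. */
static void bruck_allgatherv_step(int rank, int size, int distance,
                                  int blockcount,
                                  const int *rcounts, const int *rdispls)
{
    for (int k = 0; k < blockcount; k++) {
        int sblk = (rank + k) % size;              /* blocks we forward  */
        int rblk = (rank + distance + k) % size;   /* blocks we receive  */
        printf("rank %d d=%d: send block %d (count %d, disp %d), "
               "recv block %d (count %d, disp %d)\n",
               rank, distance, sblk, rcounts[sblk], rdispls[sblk],
               rblk, rcounts[rblk], rdispls[rblk]);
    }
}

int main(void)
{
    int rcounts[7] = {1, 2, 1, 3, 1, 2, 1};       /* made-up uneven counts */
    int rdispls[7] = {0, 1, 3, 4, 7, 8, 10};
    bruck_allgatherv_step(2, 7, 2, 2, rcounts, rdispls);
    return 0;
}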
/*
* ompi_coll_tuned_allgatherv_intra_ring
* ompi_coll_base_allgatherv_intra_ring
*
* Function: allgatherv using O(N) steps.
* Accepts: Same arguments as MPI_Allgatherv
@@ -236,9 +228,9 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
* (r + 1) containing data from rank (r - i), with wrap arounds.
* Memory requirements:
* No additional memory requirements.
*
*
*/
int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
int ompi_coll_base_allgatherv_intra_ring(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
@@ -252,8 +244,8 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_ring rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgatherv_intra_ring rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -262,24 +254,24 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Initialization step:
- if send buffer is not MPI_IN_PLACE, copy send buffer to
- if send buffer is not MPI_IN_PLACE, copy send buffer to
the appropriate block of receive buffer
*/
tmprecv = (char*) rbuf + (ptrdiff_t)rdisps[rank] * rext;
if (MPI_IN_PLACE != sbuf) {
tmpsend = (char*) sbuf;
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
tmprecv, rcounts[rank], rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Communication step:
At every step i: 0 .. (P-1), rank r:
- receives message from [(r - 1 + size) % size] containing data from rank
[(r - i - 1 + size) % size]
- sends message to rank [(r + 1) % size] containing data from rank
[(r - i + size) % size]
- sends message which starts at begining of rbuf and has size
- sends message which starts at begining of rbuf and has size
*/
sendto = (rank + 1) % size;
recvfrom = (rank - 1 + size) % size;
@@ -292,47 +284,46 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
tmpsend = (char*)rbuf + rdisps[senddatafrom] * rext;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[senddatafrom], rdtype,
err = ompi_coll_base_sendrecv(tmpsend, rcounts[senddatafrom], rdtype,
sendto, MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, rcounts[recvdatafrom], rdtype,
tmprecv, rcounts[recvdatafrom], rdtype,
recvfrom, MCA_COLL_BASE_TAG_ALLGATHERV,
comm, MPI_STATUS_IGNORE, rank);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
/*
* ompi_coll_tuned_allgatherv_intra_neighborexchange
* ompi_coll_base_allgatherv_intra_neighborexchange
*
* Function: allgatherv using N/2 steps (O(N))
* Accepts: Same arguments as MPI_Allgatherv
* Returns: MPI_SUCCESS or error code
*
* Description: Neighbor Exchange algorithm for allgather adapted for
* Description: Neighbor Exchange algorithm for allgather adapted for
* allgatherv.
* Described by Chen et.al. in
* "Performance Evaluation of Allgather Algorithms on
* Described by Chen et.al. in
* "Performance Evaluation of Allgather Algorithms on
* Terascale Linux Cluster with Fast Ethernet",
* Proceedings of the Eighth International Conference on
* Proceedings of the Eighth International Conference on
* High-Performance Computing inn Asia-Pacific Region
* (HPCASIA'05), 2005
*
*
* Rank r exchanges message with one of its neighbors and
* forwards the data further in the next step.
*
* No additional memory requirements.
*
*
* Limitations: Algorithm works only on even number of processes.
* For odd number of processes we switch to ring algorithm.
*
*
* Example on 6 nodes:
* Initial state
* # 0 1 2 3 4 5
@@ -367,8 +358,8 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
* [4] [4] [4] [4] [4] [4]
* [5] [5] [5] [5] [5] [5]
*/
int
ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
int
ompi_coll_base_allgatherv_intra_neighborexchange(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdispls,
struct ompi_datatype_t *rdtype,
@@ -386,17 +377,17 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
if (size % 2) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgatherv_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
size));
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts,
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts,
rdispls, rdtype,
comm, module);
}
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_neighborexchange rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allgatherv_intra_neighborexchange rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -405,16 +396,16 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Initialization step:
- if send buffer is not MPI_IN_PLACE, copy send buffer to
- if send buffer is not MPI_IN_PLACE, copy send buffer to
the appropriate block of receive buffer
*/
tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
if (MPI_IN_PLACE != sbuf) {
tmpsend = (char*) sbuf;
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
tmprecv, rcounts[rank], rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
/* Determine neighbors, order in which blocks will arrive, etc. */
even_rank = !(rank % 2);
@@ -436,8 +427,8 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
/* Communication loop:
- First step is special: exchange a single block with neighbor[0].
- Rest of the steps:
update recv_data_from according to offset, and
- Rest of the steps:
update recv_data_from according to offset, and
exchange two blocks with appropriate neighbor.
the send location becomes previous receve location.
Note, we need to create indexed datatype to send and receive these
@@ -445,13 +436,13 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
*/
tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[neighbor[0]] * rext;
tmpsend = (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext;
err = ompi_coll_tuned_sendrecv(tmpsend, rcounts[rank], rdtype,
err = ompi_coll_base_sendrecv(tmpsend, rcounts[rank], rdtype,
neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, rcounts[neighbor[0]], rdtype,
tmprecv, rcounts[neighbor[0]], rdtype,
neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
comm, MPI_STATUS_IGNORE, rank);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* Determine initial sending counts and displacements*/
if (even_rank) {
send_data_from = rank;
@@ -461,7 +452,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
for (i = 1; i < (size / 2); i++) {
const int i_parity = i % 2;
recv_data_from[i_parity] =
recv_data_from[i_parity] =
(recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
/* Create new indexed types for sending and receiving.
@@ -473,7 +464,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
new_scounts[1] = rcounts[(send_data_from + 1)];
new_sdispls[0] = rdispls[send_data_from];
new_sdispls[1] = rdispls[(send_data_from + 1)];
err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype,
err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype,
&new_sdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
err = ompi_datatype_commit(&new_sdtype);
@@ -483,17 +474,17 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
new_rdispls[0] = rdispls[recv_data_from[i_parity]];
new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype,
err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype,
&new_rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
err = ompi_datatype_commit(&new_rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
tmprecv = (char*)rbuf;
tmpsend = (char*)rbuf;
/* Sendreceive */
err = ompi_coll_tuned_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
err = ompi_coll_base_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, 1, new_rdtype, neighbor[i_parity],
MCA_COLL_BASE_TAG_ALLGATHERV,
@@ -501,7 +492,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
send_data_from = recv_data_from[i_parity];
ompi_datatype_destroy(&new_sdtype);
ompi_datatype_destroy(&new_rdtype);
}
@@ -509,13 +500,13 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
int ompi_coll_base_allgatherv_intra_two_procs(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts,
int *rdispls,
@@ -529,8 +520,8 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgatherv_intra_two_procs rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_allgatherv_intra_two_procs rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
@@ -552,7 +543,7 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
}
tmprecv = (char*)rbuf + (ptrdiff_t)rdispls[remote] * rext;
err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote,
err = ompi_coll_base_sendrecv(tmpsend, scount, sdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHERV,
tmprecv, rcounts[remote], rdtype, remote,
MCA_COLL_BASE_TAG_ALLGATHERV,
@@ -561,16 +552,16 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
/* Place your data in correct location if necessary */
if (MPI_IN_PLACE != sbuf) {
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
(char*)rbuf + (ptrdiff_t)rdispls[rank] * rext,
rcounts[rank], rdtype);
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
(char*)rbuf + (ptrdiff_t)rdispls[rank] * rext,
rcounts[rank], rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@@ -579,13 +570,13 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
@@ -593,19 +584,19 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
/*
* allgatherv_intra_basic
*
* Function: - allgatherv using other MPI collectives:
* Function: - allgatherv using other MPI collectives:
* gatherv + bcast (from basic module).
* Accepts: - same as MPI_Allgatherv()
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *disps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
ompi_coll_base_allgatherv_intra_basic_default(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *disps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, size, rank, err;
MPI_Aint extent, lb;
@@ -619,8 +610,8 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
* to process with rank 0 (OMPI convention)
*/
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgatherv_intra_basic_default rank %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_allgatherv_intra_basic_default rank %d",
rank));
if (MPI_IN_PLACE == sbuf) {
@@ -639,7 +630,6 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
rcounts[rank], send_type,rbuf,
rcounts, disps, rdtype, 0,
comm, comm->c_coll.coll_gatherv_module);
if (MPI_SUCCESS != err) {
return err;
}
@@ -648,7 +638,7 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
* broadcast the data out to the other processes
*
* Need to define a datatype that captures the different vectors
* from each process. MPI_TYPE_INDEXED with params
* from each process. MPI_TYPE_INDEXED with params
* size,rcount,displs,rdtype,newtype
* should do the trick.
* Use underlying ddt functions to create, and commit the
@@ -660,7 +650,7 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
if (MPI_SUCCESS != err) {
return err;
}
err = ompi_datatype_commit(&newtype);
if(MPI_SUCCESS != err) {
return err;
@@ -675,178 +665,3 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
}
/* copied function (with appropriate renaming) ends here */
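
The composition above is simple enough to restate against the plain MPI API; a self-contained sketch (hypothetical function name, standard MPI calls only) of the same gatherv + bcast idea, using one indexed datatype so the possibly unordered displacements are respected during the broadcast:

#include <mpi.h>

/* Allgatherv as gatherv-to-root followed by a bcast of an indexed type. */
int allgatherv_via_gatherv_bcast(void *sbuf, int scount, MPI_Datatype sdtype,
                                 void *rbuf, int *rcounts, int *disps,
                                 MPI_Datatype rdtype, MPI_Comm comm)
{
    int size, err;
    MPI_Datatype newtype;

    MPI_Comm_size(comm, &size);
    err = MPI_Gatherv(sbuf, scount, sdtype,
                      rbuf, rcounts, disps, rdtype, 0, comm);
    if (MPI_SUCCESS != err) return err;

    /* one datatype covering all "size" vectors of the receive buffer */
    err = MPI_Type_indexed(size, rcounts, disps, rdtype, &newtype);
    if (MPI_SUCCESS != err) return err;
    err = MPI_Type_commit(&newtype);
    if (MPI_SUCCESS != err) return err;

    err = MPI_Bcast(rbuf, 1, newtype, 0, comm);
    MPI_Type_free(&newtype);
    return err;
}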
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* module does not call this they call the forced_getvalues routine instead */
int
ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV] = coll_tuned_allgatherv_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_count",
"Number of allgatherv algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_allgatherv_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allgatherv_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allgatherv_algorithms", allgatherv_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm",
"Which allallgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allgatherv_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_segmentsize",
"Segment size in bytes used by default for allgatherv algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_segment_size);
coll_tuned_allgatherv_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_tree_fanout",
"Fanout for n-tree used for allgatherv algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_tree_fanout);
coll_tuned_allgatherv_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_chain_fanout",
"Fanout for chains used for allgatherv algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
data->user_forced[ALLGATHERV].algorithm));
switch (data->user_forced[ALLGATHERV].algorithm) {
case (0):
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgatherv_intra_basic_default (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgatherv_intra_bruck (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgatherv_intra_ring (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgatherv_intra_neighborexchange (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLGATHERV].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout,
int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_allgatherv_intra_basic_default(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (3):
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (4):
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (5):
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
return (MPI_ERR_ARG);
} /* switch */
}
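For reference, a minimal sketch (not part of this changeset) of pinning one variant through the do_this entry point above; the wrapper name is hypothetical, and the same effect is available at run time through the parameters registered above, e.g. mpirun --mca coll_tuned_use_dynamic_rules 1 --mca coll_tuned_allgatherv_algorithm 3.

/* Hypothetical wrapper, for illustration only: force the ring variant
 * (case 3 in the switch above). The ring algorithm uses neither the
 * faninout nor the segsize hint, so zeros are passed. */
static int force_ring_allgatherv(void *sbuf, int scount,
                                 struct ompi_datatype_t *sdtype,
                                 void *rbuf, int *rcounts, int *rdispls,
                                 struct ompi_datatype_t *rdtype,
                                 struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module)
{
    return ompi_coll_tuned_allgatherv_intra_do_this(sbuf, scount, sdtype,
                                                    rbuf, rcounts, rdispls,
                                                    rdtype, comm, module,
                                                    3 /* ring */, 0, 0);
}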

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -31,41 +31,23 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* allreduce algorithm variables */
static int coll_tuned_allreduce_algorithm_count = 5;
static int coll_tuned_allreduce_forced_algorithm = 0;
static int coll_tuned_allreduce_segment_size = 0;
static int coll_tuned_allreduce_tree_fanout;
static int coll_tuned_allreduce_chain_fanout;
/* valid values for coll_tuned_allreduce_forced_algorithm */
static mca_base_var_enum_value_t allreduce_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "nonoverlapping"},
{3, "recursive_doubling"},
{4, "ring"},
{5, "segmented_ring"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/*
* ompi_coll_tuned_allreduce_intra_nonoverlapping
* ompi_coll_base_allreduce_intra_nonoverlapping
*
* This function just calls a reduce followed by a broadcast
* both called functions are tuned but they complete sequentially,
* both called functions come from the base module, but they complete sequentially,
* i.e. no additional overlapping
* meaning if the number of segments used is greater than the topo depth
* meaning if the number of segments used is greater than the topo depth
* then once the first segment of data is fully 'reduced' it is not broadcast
* while the reduce continues (cost = cost-reduce + cost-bcast + decision x 3)
*
*/
int
ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
ompi_coll_base_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
@ -75,16 +57,16 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_nonoverlapping rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_nonoverlapping rank %d", rank));
/* Reduce to 0 and broadcast. */
if (MPI_IN_PLACE == sbuf) {
if (0 == rank) {
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype,
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, rbuf, count, dtype,
op, 0, comm, comm->c_coll.coll_reduce_module);
} else {
err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0,
err = comm->c_coll.coll_reduce (rbuf, NULL, count, dtype, op, 0,
comm, comm->c_coll.coll_reduce_module);
}
} else {
@ -100,21 +82,21 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
}
/*
* ompi_coll_tuned_allreduce_intra_recursivedoubling
* ompi_coll_base_allreduce_intra_recursivedoubling
*
* Function: Recursive doubling algorithm for allreduce operation
* Accepts: Same as MPI_Allreduce()
* Returns: MPI_SUCCESS or error code
*
* Description: Implements recursive doubling algorithm for allreduce.
* Original (non-segmented) implementation is used in MPICH-2
* Description: Implements recursive doubling algorithm for allreduce.
* Original (non-segmented) implementation is used in MPICH-2
* for small and intermediate size messages.
* The algorithm preserves order of operations so it can
* The algorithm preserves order of operations so it can
* be used both by commutative and non-commutative operations.
*
* Example on 7 nodes:
* Initial state
* # 0 1 2 3 4 5 6
* # 0 1 2 3 4 5 6
* [0] [1] [2] [3] [4] [5] [6]
* Initial adjustment step for non-power of two nodes.
* old rank 1 3 5 6
@ -129,24 +111,24 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
* old rank 1 3 5 6
* new rank 0 1 2 3
* [0+1+] [0+1+] [0+1+] [0+1+]
* [2+3+] [2+3+] [2+3+] [2+3+]
* [2+3+] [2+3+] [2+3+] [2+3+]
* [4+5+] [4+5+] [4+5+] [4+5+]
* [6 ] [6 ] [6 ] [6 ]
* Final adjustment step for non-power of two nodes
* # 0 1 2 3 4 5 6
* # 0 1 2 3 4 5 6
* [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+]
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
* [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+]
* [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ]
*
*/
int
ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
int
ompi_coll_base_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
mca_coll_base_module_t *module)
{
int ret, line, rank, size, adjsize, remote, distance;
int newrank, newremote, extra_ranks;
@ -157,9 +139,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allreduce_intra_recursivedoubling rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allreduce_intra_recursivedoubling rank %d", rank));
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
@ -194,16 +176,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
adjsize >>= 1;
/* Handle non-power-of-two case:
- Even ranks less than 2 * extra_ranks send their data to (rank + 1), and
- Even ranks less than 2 * extra_ranks send their data to (rank + 1), and
set new rank to -1.
- Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
- Odd ranks less than 2 * extra_ranks receive data from (rank - 1),
apply appropriate operation, and set new rank to rank/2
- Everyone else sets rank to rank - extra_ranks
*/
extra_ranks = size - adjsize;
if (rank < (2 * extra_ranks)) {
if (0 == (rank % 2)) {
ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1),
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
@ -221,7 +203,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
newrank = rank - extra_ranks;
}
/* Communication/Computation loop
/* Communication/Computation loop
- Exchange message with remote node.
- Perform appropriate operation taking in account order of operations:
result = value (op) result
@ -230,14 +212,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
if (newrank < 0) break;
/* Determine remote node */
newremote = newrank ^ distance;
remote = (newremote < extra_ranks)?
remote = (newremote < extra_ranks)?
(newremote * 2 + 1):(newremote + extra_ranks);
/* Exchange the data */
ret = MCA_PML_CALL(irecv(tmprecv, count, dtype, remote,
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[0]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote,
ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
@ -258,14 +240,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
}
/* Handle non-power-of-two case:
- Odd ranks less than 2 * extra_ranks send result from tmpsend to
- Odd ranks less than 2 * extra_ranks send result from tmpsend to
(rank - 1)
- Even ranks less than 2 * extra_ranks receive result from (rank + 1)
*/
if (rank < (2 * extra_ranks)) {
if (0 == (rank % 2)) {
ret = MCA_PML_CALL(recv(rbuf, count, dtype, (rank + 1),
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
MPI_STATUS_IGNORE));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
tmpsend = (char*)rbuf;
@ -287,14 +269,14 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != inplacebuf) free(inplacebuf);
return ret;
}
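The non-power-of-two fold described in the comment above can be restated compactly. A standalone sketch (not part of the patch) of the rank remapping: for size = 7 it maps old ranks 1, 3, 5, 6 to new ranks 0, 1, 2, 3 and parks ranks 0, 2, 4 with -1, exactly as in the example.

/* Sketch of the rank fold used above: adjsize is the largest power of
 * two <= size; the first 2*extra_ranks ranks are folded pairwise, with
 * even ranks dropping out (-1), and the rest are shifted down. */
static int recursive_doubling_newrank(int rank, int size)
{
    int adjsize = 1, extra_ranks;
    while (adjsize <= size) adjsize <<= 1;
    adjsize >>= 1;                      /* largest power of two <= size */
    extra_ranks = size - adjsize;
    if (rank < 2 * extra_ranks)
        return (0 == rank % 2) ? -1 : rank / 2;   /* even ranks sit out */
    return rank - extra_ranks;
}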
/*
* ompi_coll_tuned_allreduce_intra_ring
* ompi_coll_base_allreduce_intra_ring
*
* Function: Ring algorithm for allreduce operation
* Accepts: Same as MPI_Allreduce()
@ -304,9 +286,9 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* automatically segmented to segment of size M/N.
* Algorithm requires 2*N - 1 steps.
*
* Limitations: The algorithm DOES NOT preserve order of operations so it
* Limitations: The algorithm DOES NOT preserve order of operations so it
* can be used only for commutative operations.
* In addition, algorithm cannot work if the total count is
* In addition, algorithm cannot work if the total count is
* less than size.
* Example on 5 nodes:
* Initial state
@ -318,7 +300,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [04] [14] [24] [34] [44]
*
* COMPUTATION PHASE
* Step 0: rank r sends block r to rank (r+1) and receives block (r-1)
* Step 0: rank r sends block r to rank (r+1) and receives block (r-1)
* from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [20] [30] [40]
@ -327,7 +309,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [03] [13] [23] [33] [33+43]
* [44+04] [14] [24] [34] [44]
*
* Step 1: rank r sends block (r-1) to rank (r+1) and receives block
* Step 1: rank r sends block (r-1) to rank (r+1) and receives block
* (r-2) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [01+10+20] [30] [40]
@ -336,7 +318,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [33+43+03] [13] [23] [33] [33+43]
* [44+04] [44+04+14] [24] [34] [44]
*
* Step 2: rank r sends block (r-2) to rank (r+1) and receives block
* Step 2: rank r sends block (r-2) to rank (r+1) and receives block
* (r-3) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [01+10+20] [01+10+20+30] [40]
@ -345,7 +327,7 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [33+43+03] [33+43+03+13] [23] [33] [33+43]
* [44+04] [44+04+14] [44+04+14+24] [34] [44]
*
* Step 3: rank r sends block (r-3) to rank (r+1) and receives block
* Step 3: rank r sends block (r-3) to rank (r+1) and receives block
* (r-4) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [00+10] [01+10+20] [01+10+20+30] [FULL]
@ -353,16 +335,16 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf,
* [22+32+42+02] [FULL] [22] [22+32] [22+32+42]
* [33+43+03] [33+43+03+13] [FULL] [33] [33+43]
* [44+04] [44+04+14] [44+04+14+24] [FULL] [44]
*
*
* DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1.
*
*/
int
ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
int
ompi_coll_base_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
mca_coll_base_module_t *module)
{
int ret, line, rank, size, k, recv_from, send_to, block_count, inbi;
int early_segcount, late_segcount, split_rank, max_segcount;
@ -375,9 +357,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allreduce_intra_ring rank %d, count %d", rank, count));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allreduce_intra_ring rank %d, count %d", rank, count));
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
@ -389,10 +371,10 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
/* Special case for count less than size - use recursive doubling */
if (count < size) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count));
return (ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count));
return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
count,
dtype, op,
dtype, op,
comm, module));
}
@ -404,14 +386,14 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
ret = ompi_datatype_type_size( dtype, &typelng);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Determine the number of elements per block and corresponding
/* Determine the number of elements per block and corresponding
block sizes.
The blocks are divided into "early" and "late" ones:
blocks 0 .. (split_rank - 1) are "early" and
blocks 0 .. (split_rank - 1) are "early" and
blocks (split_rank) .. (size - 1) are "late".
Early blocks are at most 1 element larger than the late ones.
*/
COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank,
early_segcount, late_segcount );
max_segcount = early_segcount;
max_real_segsize = true_extent + (max_segcount - 1) * extent;
@ -432,7 +414,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
/* Computation loop */
/*
/*
For each of the remote nodes:
- post irecv for block (r-1)
- send block (r)
@ -456,8 +438,8 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Send first block (my block) to the neighbor on the right */
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_segcount) :
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_segcount) :
((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank));
block_count = ((rank < split_rank)? early_segcount : late_segcount);
tmpsend = ((char*)rbuf) + block_offset * extent;
@ -465,21 +447,21 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
for (k = 2; k < size; k++) {
const int prevblock = (rank + size - k + 1) % size;
inbi = inbi ^ 0x1;
/* Post irecv for the current block */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Wait on previous block to arrive */
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on previous block: result goes to rbuf
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
*/
@ -489,7 +471,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
block_count = ((prevblock < split_rank)? early_segcount : late_segcount);
tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);
/* send previous block to send_to */
ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
@ -501,7 +483,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on the last block (from neighbor (rank + 1)
/* Apply operation on the last block (from neighbor (rank + 1)
rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
recv_from = (rank + 1) % size;
block_offset = ((recv_from < split_rank)?
@ -510,28 +492,28 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
block_count = ((recv_from < split_rank)? early_segcount : late_segcount);
tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype);
/* Distribution loop - variation of ring allgather */
send_to = (rank + 1) % size;
recv_from = (rank + size - 1) % size;
for (k = 0; k < size - 1; k++) {
const int recv_data_from = (rank + size - k) % size;
const int send_data_from = (rank + 1 + size - k) % size;
const int send_block_offset =
const int send_block_offset =
((send_data_from < split_rank)?
((ptrdiff_t)send_data_from * early_segcount) :
((ptrdiff_t)send_data_from * late_segcount + split_rank));
const int recv_block_offset =
const int recv_block_offset =
((recv_data_from < split_rank)?
((ptrdiff_t)recv_data_from * early_segcount) :
((ptrdiff_t)recv_data_from * late_segcount + split_rank));
block_count = ((send_data_from < split_rank)?
block_count = ((send_data_from < split_rank)?
early_segcount : late_segcount);
tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;
ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
tmprecv, max_segcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE,
@ -546,7 +528,7 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != inbuf[0]) free(inbuf[0]);
if (NULL != inbuf[1]) free(inbuf[1]);
@ -554,30 +536,30 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
}
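The early/late block split that COLL_BASE_COMPUTE_BLOCKCOUNT performs is central to the ring code above. A plain-function restatement (a sketch matching the semantics visible in the callers, not the actual macro):

/* Early blocks carry one extra element when count does not divide
 * evenly, e.g. count = 10, num_blocks = 4 gives split_rank = 2,
 * early = 3, late = 2. Block offsets then follow the formula used
 * above: rank < split_rank ? rank * early : rank * late + split_rank. */
static void compute_blockcount(int count, int num_blocks, int *split_rank,
                               int *early, int *late)
{
    *late = count / num_blocks;
    *split_rank = count % num_blocks;
    *early = *late + ((0 != *split_rank) ? 1 : 0);
}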
/*
* ompi_coll_tuned_allreduce_intra_ring_segmented
* ompi_coll_base_allreduce_intra_ring_segmented
*
* Function: Pipelined ring algorithm for allreduce operation
* Accepts: Same as MPI_Allreduce(), segment size
* Returns: MPI_SUCCESS or error code
*
* Description: Implements pipelined ring algorithm for allreduce:
* Description: Implements pipelined ring algorithm for allreduce:
* user supplies suggested segment size for the pipelining of
* reduce operation.
* The segment size determines the number of phases, np, for
* the algorithm execution.
* The message is automatically divided into blocks of
* The segment size determines the number of phases, np, for
* the algorithm execution.
* The message is automatically divided into blocks of
* approximately (count / (np * segcount)) elements.
* At the end of the reduction phase, an allgather-like step is
* At the end of the reduction phase, an allgather-like step is
* executed.
* Algorithm requires (np + 1)*(N - 1) steps.
*
* Limitations: The algorithm DOES NOT preserve order of operations so it
* Limitations: The algorithm DOES NOT preserve order of operations so it
* can be used only for commutative operations.
* In addition, algorithm cannot work if the total size is
* In addition, algorithm cannot work if the total size is
* less than size * segment size.
* Example on 3 nodes with 2 phases
* Initial state
* # 0 1 2
* # 0 1 2
* [00a] [10a] [20a]
* [00b] [10b] [20b]
* [01a] [11a] [21a]
@ -586,9 +568,9 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
* [02b] [12b] [22b]
*
* COMPUTATION PHASE 0 (a)
* Step 0: rank r sends block ra to rank (r+1) and receives block (r-1)a
* Step 0: rank r sends block ra to rank (r+1) and receives block (r-1)a
* from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [20a]
* [00b] [10b] [20b]
* [01a] [11a] [11a+21a]
@ -596,20 +578,20 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
* [22a+02a] [12a] [22a]
* [02b] [12b] [22b]
*
* Step 1: rank r sends block (r-1)a to rank (r+1) and receives block
* Step 1: rank r sends block (r-1)a to rank (r+1) and receives block
* (r-2)a from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [00a+10a+20a]
* [00b] [10b] [20b]
* [11a+21a+01a] [11a] [11a+21a]
* [01b] [11b] [21b]
* [22a+02a] [22a+02a+12a] [22a]
* [02b] [12b] [22b]
* [02b] [12b] [22b]
*
* COMPUTATION PHASE 1 (b)
* Step 0: rank r sends block rb to rank (r+1) and receives block (r-1)b
* Step 0: rank r sends block rb to rank (r+1) and receives block (r-1)b
* from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [20a]
* [00b] [00b+10b] [20b]
* [01a] [11a] [11a+21a]
@ -617,31 +599,31 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count,
* [22a+02a] [12a] [22a]
* [22b+02b] [12b] [22b]
*
* Step 1: rank r sends block (r-1)b to rank (r+1) and receives block
* Step 1: rank r sends block (r-1)b to rank (r+1) and receives block
* (r-2)b from rank (r-1) [with wraparound].
* # 0 1 2
* # 0 1 2
* [00a] [00a+10a] [00a+10a+20a]
* [00b] [00b+10b] [00b+10b+20b]
* [11a+21a+01a] [11a] [11a+21a]
* [11b+21b+01b] [11b] [11b+21b]
* [22a+02a] [22a+02a+12a] [22a]
* [22b+02b] [22b+02b+12b] [22b]
* [22b+02b] [22b+02b+12b] [22b]
*
*
*
* DISTRIBUTION PHASE: ring ALLGATHER with ranks shifted by 1 (same as
* in the regular ring algorithm).
*
*/
int
ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count,
int
ompi_coll_base_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
uint32_t segsize)
uint32_t segsize)
{
int ret, line, rank, size, k, recv_from, send_to;
int early_blockcount, late_blockcount, split_rank;
int early_blockcount, late_blockcount, split_rank;
int segcount, max_segcount, num_phases, phase, block_count, inbi;
size_t typelng;
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
@ -652,9 +634,9 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:allreduce_intra_ring_segmented rank %d, count %d", rank, count));
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
@ -672,34 +654,34 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
ret = ompi_datatype_type_size( dtype, &typelng);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
segcount = count;
COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)
COLL_BASE_COMPUTED_SEGCOUNT(segsize, typelng, segcount)
/* Special case for count less than size * segcount - use regular ring */
if (count < (size * segcount)) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count));
return (ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count));
return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
comm, module));
}
/* Determine the number of phases of the algorithm */
num_phases = count / (size * segcount);
if ((count % (size * segcount) >= size) &&
if ((count % (size * segcount) >= size) &&
(count % (size * segcount) > ((size * segcount) / 2))) {
num_phases++;
}
/* Determine the number of elements per block and corresponding
/* Determine the number of elements per block and corresponding
block sizes.
The blocks are divided into "early" and "late" ones:
blocks 0 .. (split_rank - 1) are "early" and
blocks 0 .. (split_rank - 1) are "early" and
blocks (split_rank) .. (size - 1) are "late".
Early blocks are at most 1 element larger than the late ones.
Note, these blocks will be split into num_phases segments,
out of which the largest will have max_segcount elements.
*/
COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank,
COLL_BASE_COMPUTE_BLOCKCOUNT( count, size, split_rank,
early_blockcount, late_blockcount );
COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
COLL_BASE_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
max_segcount, k);
max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent;
@ -722,7 +704,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
ptrdiff_t phase_offset;
int early_phase_segcount, late_phase_segcount, split_phase, phase_count;
/*
/*
For each of the remote nodes:
- post irecv for block (r-1)
- send block (r)
@ -741,7 +723,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
*/
send_to = (rank + 1) % size;
recv_from = (rank + size - 1) % size;
inbi = 0;
/* Initialize first receive from the neighbor on the left */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
@ -750,81 +732,81 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
/* Send first block (my block) to the neighbor on the right:
- compute my block and phase offset
- send data */
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) :
block_offset = ((rank < split_rank)?
((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)rank * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
early_phase_segcount, late_phase_segcount)
phase_count = ((phase < split_phase)?
(early_phase_segcount) : (late_phase_segcount));
phase_offset = ((phase < split_phase)?
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
tmpsend = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
ret = MCA_PML_CALL(send(tmpsend, phase_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
for (k = 2; k < size; k++) {
const int prevblock = (rank + size - k + 1) % size;
inbi = inbi ^ 0x1;
/* Post irecv for the current block */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
&reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Wait on previous block to arrive */
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on previous block: result goes to rbuf
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
*/
block_offset = ((prevblock < split_rank)?
((ptrdiff_t)prevblock * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)prevblock * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((prevblock < split_rank)?
block_count = ((prevblock < split_rank)?
early_blockcount : late_blockcount);
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
early_phase_segcount, late_phase_segcount)
phase_count = ((phase < split_phase)?
(early_phase_segcount) : (late_phase_segcount));
phase_offset = ((phase < split_phase)?
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);
/* send previous block to send_to */
ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
}
/* Wait on the last block to arrive */
ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on the last block (from neighbor (rank + 1)
/* Apply operation on the last block (from neighbor (rank + 1)
rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
recv_from = (rank + 1) % size;
block_offset = ((recv_from < split_rank)?
((ptrdiff_t)recv_from * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)recv_from * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((recv_from < split_rank)?
block_count = ((recv_from < split_rank)?
early_blockcount : late_blockcount);
COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
early_phase_segcount, late_phase_segcount)
phase_count = ((phase < split_phase)?
(early_phase_segcount) : (late_phase_segcount));
phase_offset = ((phase < split_phase)?
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype);
@ -836,21 +818,21 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
for (k = 0; k < size - 1; k++) {
const int recv_data_from = (rank + size - k) % size;
const int send_data_from = (rank + 1 + size - k) % size;
const int send_block_offset =
const int send_block_offset =
((send_data_from < split_rank)?
((ptrdiff_t)send_data_from * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)send_data_from * (ptrdiff_t)late_blockcount + split_rank));
const int recv_block_offset =
const int recv_block_offset =
((recv_data_from < split_rank)?
((ptrdiff_t)recv_data_from * (ptrdiff_t)early_blockcount) :
((ptrdiff_t)recv_data_from * (ptrdiff_t)late_blockcount + split_rank));
block_count = ((send_data_from < split_rank)?
block_count = ((send_data_from < split_rank)?
early_blockcount : late_blockcount);
tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent;
tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent;
ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to,
ret = ompi_coll_base_sendrecv(tmpsend, block_count, dtype, send_to,
MCA_COLL_BASE_TAG_ALLREDUCE,
tmprecv, early_blockcount, dtype, recv_from,
MCA_COLL_BASE_TAG_ALLREDUCE,
@ -865,7 +847,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != inbuf[0]) free(inbuf[0]);
if (NULL != inbuf[1]) free(inbuf[1]);
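The phase count chosen at the top of the segmented ring maps directly onto the test shown above. As a standalone sketch (not part of the patch):

/* Number of pipeline phases: one per full size*segcount chunk, plus
 * one more only when the remainder is at least size elements and more
 * than half a chunk, matching the condition in the function above. */
static int ring_segmented_num_phases(int count, int size, int segcount)
{
    int chunk = size * segcount;
    int num_phases = count / chunk;
    if ((count % chunk >= size) && (count % chunk > chunk / 2))
        num_phases++;
    return num_phases;
}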
@ -875,8 +857,8 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -895,7 +877,7 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
ompi_coll_base_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
@ -905,158 +887,28 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:allreduce_intra_basic_linear rank %d", rank));
/* Reduce to 0 and broadcast. */
if (MPI_IN_PLACE == sbuf) {
if (0 == rank) {
err = ompi_coll_tuned_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype,
err = ompi_coll_base_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype,
op, 0, comm, module);
} else {
err = ompi_coll_tuned_reduce_intra_basic_linear(rbuf, NULL, count, dtype,
err = ompi_coll_base_reduce_intra_basic_linear(rbuf, NULL, count, dtype,
op, 0, comm, module);
}
} else {
err = ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
err = ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
op, 0, comm, module);
}
if (MPI_SUCCESS != err) {
return err;
}
return ompi_coll_tuned_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module);
return ompi_coll_base_bcast_intra_basic_linear(rbuf, count, dtype, 0, comm, module);
}
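Stripped of module plumbing, the composition above is the textbook reduce-then-broadcast. A sketch in plain MPI (illustration only; MPI_IN_PLACE handling omitted for brevity):

/* Allreduce expressed as a reduce to rank 0 followed by a broadcast
 * from rank 0. Requires <mpi.h>. */
static int allreduce_as_reduce_bcast(const void *sbuf, void *rbuf, int count,
                                     MPI_Datatype dtype, MPI_Op op,
                                     MPI_Comm comm)
{
    int err = MPI_Reduce(sbuf, rbuf, count, dtype, op, 0, comm);
    if (MPI_SUCCESS != err) return err;
    return MPI_Bcast(rbuf, count, dtype, 0, comm);
}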
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and whether it is forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this they call the forced_getvalues routine instead */
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = coll_tuned_allreduce_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_count",
"Number of allreduce algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_allreduce_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allreduce_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allreduce_algorithms", allreduce_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allreduce_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_segmentsize",
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_segment_size);
coll_tuned_allreduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_tree_fanout",
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_tree_fanout);
coll_tuned_allreduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_chain_fanout",
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d, segment size %d",
data->user_forced[ALLREDUCE].algorithm,
data->user_forced[ALLREDUCE].segsize));
switch (data->user_forced[ALLREDUCE].algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module);
case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module);
case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module);
case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, data->user_forced[ALLREDUCE].segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLREDUCE].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm, module);
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm, module);
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm, module);
case (3): return ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf, count, dtype, op, comm, module);
case (4): return ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype, op, comm, module);
case (5): return ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf, count, dtype, op, comm, module, segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,37 +30,18 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* alltoall algorithm variables */
static int coll_tuned_alltoall_algorithm_count = 5;
static int coll_tuned_alltoall_forced_algorithm = 0;
static int coll_tuned_alltoall_segment_size = 0;
static int coll_tuned_alltoall_max_requests;
static int coll_tuned_alltoall_tree_fanout;
static int coll_tuned_alltoall_chain_fanout;
/* valid values for coll_tuned_alltoall_forced_algorithm */
static mca_base_var_enum_value_t alltoall_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "pairwise"},
{3, "modified_bruck"},
{4, "linear_sync"},
{5, "two_proc"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */
static int
mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
int
mca_coll_base_alltoall_intra_basic_inplace(void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
int i, j, size, rank, err=MPI_SUCCESS;
MPI_Request *preq;
char *tmp_buffer;
@ -91,7 +72,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
for (i = 0 ; i < size ; ++i) {
for (j = i+1 ; j < size ; ++j) {
/* Initiate all send/recv to/from others. */
preq = tuned_module->tuned_data->mcct_reqs;
preq = coll_base_comm_get_reqs(base_module->base_data, size * 2);
if (i == rank) {
/* Copy the data into the temporary buffer */
@ -128,11 +109,8 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
}
/* Wait for the requests to complete */
err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Free the requests. */
mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
}
}
@ -145,7 +123,7 @@ mca_coll_tuned_alltoall_intra_basic_inplace(void *rbuf, int rcount,
return err;
}
int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_pairwise(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -157,22 +135,22 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
ptrdiff_t lb, sext, rext;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoall_intra_pairwise rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoall_intra_pairwise rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
err = ompi_datatype_get_extent (rdtype, &lb, &rext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* Perform pairwise exchange - starting from 1 so the local copy is last */
for (step = 1; step < size + 1; step++) {
@ -185,51 +163,47 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
tmprecv = (char*)rbuf + (ptrdiff_t)recvfrom * rext * (ptrdiff_t)rcount;
/* send and receive */
err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto,
err = ompi_coll_base_sendrecv( tmpsend, scount, sdtype, sendto,
MCA_COLL_BASE_TAG_ALLTOALL,
tmprecv, rcount, rdtype, recvfrom,
tmprecv, rcount, rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLTOALL,
comm, MPI_STATUS_IGNORE, rank);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line,
err, rank));
return err;
}
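The partner schedule driving the loop above is elided in this hunk; a sketch of the arithmetic, assuming the usual pairwise formulation:

/* At step s, every rank sends to (rank + s) mod size and receives from
 * (rank - s) mod size, so all ranks stay busy each step; running the
 * loop through step == size makes the final step the local copy. */
static void pairwise_partners(int rank, int size, int step,
                              int *sendto, int *recvfrom)
{
    *sendto   = (rank + step) % size;
    *recvfrom = (rank + size - step) % size;
}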
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_bruck(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, k, line = -1, rank, size, err = 0, weallocated = 0;
int i, k, line = -1, rank, size, err = 0;
int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
char *tmpbuf = NULL, *tmpbuf_free = NULL;
ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
struct ompi_datatype_t *new_ddt;
#ifdef blahblah
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
#endif
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoall_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoall_intra_bruck rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
@ -241,25 +215,10 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
#ifdef blahblah
/* try and SAVE memory by using the data segment hung off
the communicator if possible */
if (data->mcct_num_reqs >= size) {
/* we have enought preallocated for displments and lengths */
displs = (int*) data->mcct_reqs;
blen = (int *) (displs + size);
weallocated = 0;
}
else { /* allocate the buffers ourself */
#endif
displs = (int *) malloc(size * sizeof(int));
if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
blen = (int *) malloc(size * sizeof(int));
if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
weallocated = 1;
#ifdef blahblah
}
#endif
displs = (int *) malloc(size * sizeof(int));
if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
blen = (int *) malloc(size * sizeof(int));
if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
/* tmp buffer allocation for message data */
tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext);
@ -267,9 +226,9 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
tmpbuf = tmpbuf_free - slb;
/* Step 1 - local rotation - shift up by rank */
err = ompi_datatype_copy_content_same_ddt (sdtype,
err = ompi_datatype_copy_content_same_ddt (sdtype,
(int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
tmpbuf,
tmpbuf,
((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
if (err<0) {
line = __LINE__; err = -1; goto err_hndl;
@ -277,7 +236,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
if (rank != 0) {
err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
(char*) sbuf);
if (err<0) {
line = __LINE__; err = -1; goto err_hndl;
@ -294,7 +253,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
/* create indexed datatype */
for (i = 1; i < size; i++) {
if (( i & distance) == distance) {
displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount;
blen[k] = scount;
k++;
}
@ -307,7 +266,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* Sendreceive */
err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto,
err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto,
MCA_COLL_BASE_TAG_ALLTOALL,
rbuf, 1, new_ddt, recvfrom,
MCA_COLL_BASE_TAG_ALLTOALL,
@ -327,22 +286,20 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
for (i = 0; i < size; i++) {
err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
}
/* Step 4 - clean up */
if (tmpbuf != NULL) free(tmpbuf_free);
if (weallocated) {
if (displs != NULL) free(displs);
if (blen != NULL) free(blen);
}
if (displs != NULL) free(displs);
if (blen != NULL) free(blen);
return OMPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
rank));
if (tmpbuf != NULL) free(tmpbuf_free);
if (displs != NULL) free(displs);
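Bruck's step 1 above rotates the send buffer so that slot i of the temporary buffer holds the block originally destined for rank (rank + i) mod size. A sketch on integer block indices (hypothetical helper, not in the patch):

/* Rotate blocks up by 'rank': tmp[i] takes the block at index
 * (rank + i) mod size, mirroring the two datatype copies above. */
static void bruck_local_rotation(const int *blocks, int *tmp,
                                 int rank, int size)
{
    for (int i = 0; i < size; i++)
        tmp[i] = blocks[(rank + i) % size];
}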
@ -352,10 +309,10 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
/*
* alltoall_intra_linear_sync
*
*
* Function: Linear implementation of alltoall with limited number
* of outstanding requests.
* Accepts: Same as MPI_Alltoall(), and the maximum number of
* Accepts: Same as MPI_Alltoall(), and the maximum number of
* outstanding requests (actual number is 2 * max, since
* we count receive and send requests separately).
* Returns: MPI_SUCCESS or error code
@ -367,7 +324,7 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
* - wait for any request to complete
* - replace that request by the new one of the same type.
*/
int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_linear_sync(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -382,7 +339,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
ompi_request_t **reqs = NULL;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
@ -391,8 +348,8 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_alltoall_intra_linear_sync rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_alltoall_intra_linear_sync rank %d", rank));
error = ompi_datatype_get_extent(sdtype, &slb, &sext);
if (OMPI_SUCCESS != error) {
@ -423,18 +380,18 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
}
/* Initiate send/recv to/from others. */
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
(max_outstanding_reqs <= 0)) ?
(size - 1) : (max_outstanding_reqs));
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
sizeof(ompi_request_t*));
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
prcv = (char *) rbuf;
psnd = (char *) sbuf;
/* Post first batch of irecv and isend requests */
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
error =
MCA_PML_CALL(irecv
@ -442,7 +399,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
}
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
error =
MCA_PML_CALL(isend
@ -457,12 +414,12 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
/* Optimization for the case when all requests have been posted */
error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
} else {
/* As requests complete, replace them with corresponding requests:
- wait for any request to complete, mark the request as
- wait for any request to complete, mark the request as
MPI_REQUEST_NULL
- If it was a receive request, replace it with new irecv request
- If it was a receive request, replace it with new irecv request
(if any)
- if it was a send request, replace it with new isend request (if any)
*/
@ -476,10 +433,10 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
ncreqs++;
if (completed < total_reqs) {
if (nrreqs < (size - 1)) {
error =
error =
MCA_PML_CALL(irecv
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
MCA_COLL_BASE_TAG_ALLTOALL, comm,
MCA_COLL_BASE_TAG_ALLTOALL, comm,
&reqs[completed]));
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
++nrreqs;
@ -493,7 +450,7 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
MCA_PML_BASE_SEND_STANDARD, comm,
&reqs[completed]));
++nsreqs;
si = (si + size - 1) % size;
si = (si + size - 1) % size;
}
}
}
@ -506,15 +463,15 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
rank));
if (NULL != reqs) free(reqs);
return error;
}
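The window sizing at the top of linear_sync collapses to the fully linear algorithm when the limit is unset or too large. Restated as a sketch:

/* A non-positive or oversized max_outstanding_reqs degenerates to
 * size - 1 outstanding requests of each kind, i.e. the plain linear
 * alltoall; otherwise the user-supplied window is honored. */
static int linear_sync_window(int max_outstanding_reqs, int size)
{
    if (max_outstanding_reqs <= 0 || max_outstanding_reqs > size - 1)
        return size - 1;
    return max_outstanding_reqs;
}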
int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
int ompi_coll_base_alltoall_intra_two_procs(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -526,14 +483,14 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
ptrdiff_t sext, rext, lb;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_alltoall_intra_two_procs rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_alltoall_intra_two_procs rank %d", rank));
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
@ -548,17 +505,17 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
tmprecv = (char*)rbuf + (ptrdiff_t)remote * rext * (ptrdiff_t)rcount;
/* send and receive */
err = ompi_coll_tuned_sendrecv ( tmpsend, scount, sdtype, remote,
err = ompi_coll_base_sendrecv ( tmpsend, scount, sdtype, remote,
MCA_COLL_BASE_TAG_ALLTOALL,
tmprecv, rcount, rdtype, remote,
tmprecv, rcount, rdtype, remote,
MCA_COLL_BASE_TAG_ALLTOALL,
comm, MPI_STATUS_IGNORE, rank );
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* ddt sendrecv your own data */
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
(int32_t) scount, sdtype,
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
(int32_t) scount, sdtype,
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
(int32_t) rcount, rdtype);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
@ -566,7 +523,7 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
rank));
return err;
@ -577,8 +534,8 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -588,22 +545,22 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
/* copied function (with appropriate renaming) starts here */
int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
int ompi_coll_base_alltoall_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, err, nreqs;
char *psnd, *prcv;
MPI_Aint lb, sndinc, rcvinc;
ompi_request_t **req, **sreq, **rreq;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype,
comm, module);
}
@ -612,9 +569,8 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_alltoall_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_alltoall_intra_basic_linear rank %d", rank));
err = ompi_datatype_get_extent(sdtype, &lb, &sndinc);
if (OMPI_SUCCESS != err) {
@ -646,44 +602,41 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
/* Initiate all send/recv to/from others. */
req = rreq = data->mcct_reqs;
sreq = rreq + size - 1;
req = rreq = coll_base_comm_get_reqs(data, (size - 1) * 2);
prcv = (char *) rbuf;
psnd = (char *) sbuf;
/* Post all receives first -- a simple optimization */
for (nreqs = 0, i = (rank + 1) % size; i != rank;
for (nreqs = 0, i = (rank + 1) % size; i != rank;
i = (i + 1) % size, ++rreq, ++nreqs) {
err =
MCA_PML_CALL(irecv_init
(prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
err = MCA_PML_CALL(irecv_init
(prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(req, rreq - req);
ompi_coll_base_free_reqs(req, nreqs);
return err;
}
}
/* Now post all sends in reverse order
/* Now post all sends in reverse order
- We would like to minimize the search time through message queue
when messages actually arrive in the order in which they were posted.
*/
for (nreqs = 0, i = (rank + size - 1) % size; i != rank;
sreq = rreq;
for (i = (rank + size - 1) % size; i != rank;
i = (i + size - 1) % size, ++sreq, ++nreqs) {
err =
MCA_PML_CALL(isend_init
(psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i,
MCA_COLL_BASE_TAG_ALLTOALL,
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
err = MCA_PML_CALL(isend_init
(psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i,
MCA_COLL_BASE_TAG_ALLTOALL,
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(req, sreq - req);
ompi_coll_base_free_reqs(req, nreqs);
return err;
}
}
nreqs = (size - 1) * 2;
/* Start your engines. This will never return an error. */
MCA_PML_CALL(start(nreqs, req));
@ -698,165 +651,10 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);
/* Free the reqs */
ompi_coll_tuned_free_reqs(req, nreqs);
ompi_coll_base_free_reqs(req, nreqs);
/* All done */
return err;
}
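The irecv_init/isend_init pairs above build persistent requests that are fired with a single start call. A minimal sketch of the same pattern with the public persistent-request API follows; the function name, tag 0, and byte-sized blocks are illustrative assumptions.
#include <mpi.h>
#include <stdlib.h>
/* Persistent-request exchange: initialize every receive, then every send
 * in reverse rank order, start them all at once, wait, and free.  The
 * reverse send order means a message tends to arrive after its matching
 * receive is already posted, keeping the message-queue search short. */
static int linear_exchange_persistent_sketch(const char *sbuf, char *rbuf,
                                             int bytes, MPI_Comm comm)
{
    int rank, size, n = 0, err;
    MPI_Request *reqs;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    reqs = malloc(2 * (size - 1) * sizeof(MPI_Request));
    for (int i = (rank + 1) % size; i != rank; i = (i + 1) % size)
        MPI_Recv_init(rbuf + (size_t)i * bytes, bytes, MPI_BYTE, i, 0,
                      comm, &reqs[n++]);
    for (int i = (rank + size - 1) % size; i != rank;
         i = (i + size - 1) % size)
        MPI_Send_init((void *)(sbuf + (size_t)i * bytes), bytes, MPI_BYTE,
                      i, 0, comm, &reqs[n++]);
    MPI_Startall(n, reqs);                    /* start your engines */
    err = MPI_Waitall(n, reqs, MPI_STATUSES_IGNORE);
    for (int i = 0; i < n; ++i)               /* persistent: free explicitly */
        MPI_Request_free(&reqs[i]);
    free(reqs);
    return err;   /* the local rank-to-rank block still needs a plain copy */
}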
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if it's forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* module does not call this; they call the forced_getvalues routine instead */
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t*new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = coll_tuned_alltoall_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_count",
"Number of alltoall algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_alltoall_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_alltoall_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_alltoall_algorithms", alltoall_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm",
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: linear with sync, 5:two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_alltoall_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_segmentsize",
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_segment_size);
coll_tuned_alltoall_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_tree_fanout",
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_tree_fanout);
coll_tuned_alltoall_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_chain_fanout",
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_chain_fanout);
coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
mca_param_indices->max_requests_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_max_requests",
"Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_max_requests);
if (mca_param_indices->max_requests_param_index < 0) {
return mca_param_indices->max_requests_param_index;
}
if (coll_tuned_alltoall_max_requests < 0) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to system level default %d \n",
ompi_coll_tuned_init_max_requests );
}
coll_tuned_alltoall_max_requests = 0;
}
return (MPI_SUCCESS);
}
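For context, this registration is what backs the run-time override users see. Assuming the usual coll_tuned MCA parameter prefixing and the dynamic-rules switch the tuned component also registers (both outside this hunk), forcing e.g. the pairwise algorithm from the command line would look like:
mpirun --mca coll_tuned_use_dynamic_rules 1 --mca coll_tuned_alltoall_algorithm 2 ./app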
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
data->user_forced[ALLTOALL].algorithm));
switch (data->user_forced[ALLTOALL].algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, data->user_forced[ALLTOALL].max_requests);
case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize,
int max_requests)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (3): return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (4): return ompi_coll_tuned_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, max_requests);
case (5): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
} /* switch */
}
@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -32,29 +32,17 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/* alltoallv algorithm variables */
static int coll_tuned_alltoallv_algorithm_count = 2;
static int coll_tuned_alltoallv_forced_algorithm = 0;
/* valid values for coll_tuned_alltoallv_forced_algorithm */
static mca_base_var_enum_value_t alltoallv_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "pairwise"},
{0, NULL}
};
static int
mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
int
mca_coll_base_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
int i, j, size, rank, err=MPI_SUCCESS;
MPI_Request *preq;
char *tmp_buffer;
@ -90,7 +78,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
for (i = 0 ; i < size ; ++i) {
for (j = i+1 ; j < size ; ++j) {
/* Initiate all send/recv to/from others. */
preq = tuned_module->tuned_data->mcct_reqs;
preq = coll_base_comm_get_reqs(base_module->base_data, 2);
if (i == rank && rcounts[j]) {
/* Copy the data into the temporary buffer */
@ -127,11 +115,8 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
}
/* Wait for the requests to complete */
err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE);
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Free the requests. */
mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2);
}
}
@ -145,7 +130,7 @@ mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, con
}
int
ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
ompi_coll_base_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
@ -157,15 +142,15 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
ptrdiff_t sext, rext;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
rdtype, comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_pairwise rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoallv_intra_pairwise rank %d", rank));
ompi_datatype_type_extent(sdtype, &sext);
ompi_datatype_type_extent(rdtype, &rext);
@ -182,34 +167,33 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
prcv = (char*)rbuf + (ptrdiff_t)rdisps[recvfrom] * rext;
/* send and receive */
err = ompi_coll_tuned_sendrecv( psnd, scounts[sendto], sdtype, sendto,
err = ompi_coll_base_sendrecv( psnd, scounts[sendto], sdtype, sendto,
MCA_COLL_BASE_TAG_ALLTOALLV,
prcv, rcounts[recvfrom], rdtype, recvfrom,
prcv, rcounts[recvfrom], rdtype, recvfrom,
MCA_COLL_BASE_TAG_ALLTOALLV,
comm, MPI_STATUS_IGNORE, rank);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"%s:%4d\tError occurred %d, rank %2d at step %d", __FILE__, line,
err, rank, step));
return err;
}
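The schedule above pairs the whole communicator off step by step. A self-contained sketch of the same pairwise exchange follows, using the public API; byte displacements, tag 0, and the function name are illustrative assumptions (the real code scales displacements by the datatype extent).
#include <mpi.h>
/* Pairwise alltoallv schedule: in step s every rank sends to (rank + s)
 * and receives from (rank - s) mod size, so each step is a perfect
 * pairing of the communicator with no contention. */
static int alltoallv_pairwise_sketch(const char *sbuf, const int scounts[],
                                     const int sdisps[], char *rbuf,
                                     const int rcounts[], const int rdisps[],
                                     MPI_Comm comm)
{
    int rank, size, err = MPI_SUCCESS;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    /* step 0 moves the local block; steps 1..size-1 cover all peers */
    for (int step = 0; step < size && MPI_SUCCESS == err; ++step) {
        int sendto   = (rank + step) % size;
        int recvfrom = (rank + size - step) % size;
        err = MPI_Sendrecv((void *)(sbuf + sdisps[sendto]), scounts[sendto],
                           MPI_BYTE, sendto, 0,
                           rbuf + rdisps[recvfrom], rcounts[recvfrom],
                           MPI_BYTE, recvfrom, 0, comm, MPI_STATUS_IGNORE);
    }
    return err;
}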
/*
/**
* Linear functions are copied from the basic coll module. For
* some small number of nodes and/or small data sizes they are just as
* fast as tuned/tree based segmenting operations and as such may be
* fast as base/tree based segmenting operations and as such may be
* selected by the decision functions. These are copied into this module
* due to the way we select modules in V1. i.e. in V2 we will handle this
* differently and so will not have to duplicate code.
* GEF Oct05 after asking Jeff.
*/
int
ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
ompi_coll_base_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
@ -220,19 +204,19 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
char *psnd, *prcv;
ptrdiff_t sext, rext;
MPI_Request *preq;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
if (MPI_IN_PLACE == sbuf) {
return mca_coll_tuned_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
return mca_coll_base_alltoallv_intra_basic_inplace (rbuf, rcounts, rdisps,
rdtype, comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:alltoallv_intra_basic_linear rank %d", rank));
ompi_datatype_type_extent(sdtype, &sext);
ompi_datatype_type_extent(rdtype, &rext);
@ -255,7 +239,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
/* Now, initiate all send/recv to/from others. */
nreqs = 0;
preq = data->mcct_reqs;
preq = coll_base_comm_get_reqs(data, 2 * size);
/* Post all receives first */
for (i = 0; i < size; ++i) {
@ -269,7 +253,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
preq++));
++nreqs;
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
return err;
}
}
@ -287,7 +271,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
preq++));
++nreqs;
if (MPI_SUCCESS != err) {
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
return err;
}
}
@ -305,128 +289,7 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
MPI_STATUSES_IGNORE);
/* Free the requests. */
ompi_coll_tuned_free_reqs(data->mcct_reqs, nreqs);
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
return err;
}
/*
* The following are used by dynamic and forced rules. Publish
* details of each algorithm and if it's forced/fixed/locked in; as you add
* methods/algorithms you must update this and the query/map routines.
* This routine is called by the component only. This makes sure that
* the mca parameters are set to their initial values and perms.
* Module does not call this. They call the forced_getvalues routine
* instead.
*/
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t
*mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV] = coll_tuned_alltoallv_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoallv_algorithm_count",
"Number of alltoallv algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_alltoallv_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_alltoallv_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_alltoallv_algorithms", alltoallv_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoallv_algorithm",
"Which alltoallv algorithm is used. "
"Can be locked down to choice of: 0 ignore, "
"1 basic linear, 2 pairwise.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoallv_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_alltoallv_intra_do_forced(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_forced selected algorithm %d",
data->user_forced[ALLTOALLV].algorithm));
switch (data->user_forced[ALLTOALLV].algorithm) {
case (0):
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_forced attempt to "
"select algorithm %d when only 0-%d is valid.",
data->user_forced[ALLTOALLV].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
return (MPI_ERR_ARG);
}
}
/* If the user selects dynamic rules and specifies the algorithm to
* use, then this function is called. */
int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_this selected algorithm %d ",
algorithm));
switch (algorithm) {
case (0):
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (1):
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (2):
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoall_intra_do_this attempt to select "
"algorithm %d when only 0-%d is valid.",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
return (MPI_ERR_ARG);
}
}
@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -31,25 +31,9 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* barrier algorithm variables */
static int coll_tuned_barrier_algorithm_count = 6;
static int coll_tuned_barrier_forced_algorithm = 0;
/* valid values for coll_tuned_barrier_forced_algorithm */
static mca_base_var_enum_value_t barrier_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "double_ring"},
{3, "recursive_doubling"},
{4, "bruck"},
{5, "two_proc"},
{6, "tree"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/**
* A quick version of the MPI_Sendreceive implemented for the barrier.
@ -57,7 +41,7 @@ static mca_base_var_enum_value_t barrier_algorithms[] = {
* signal a two peer synchronization.
*/
static inline int
ompi_coll_tuned_sendrecv_zero(int dest, int stag,
ompi_coll_base_sendrecv_zero(int dest, int stag,
int source, int rtag,
MPI_Comm comm)
@ -87,8 +71,8 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
err_index = 1;
}
err = statuses[err_index].MPI_ERROR;
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_tuned_sendrecv_zero\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_base_sendrecv_zero\n",
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
return err;
}
@ -100,21 +84,21 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
/* Error discovered during the posting of the irecv or isend,
* and no status is available.
*/
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
__FILE__, line, err));
return err;
}
/*
* Barrier is meant to be a synchronous operation: some BTLs can mark
* a request done before it is passed to the NIC, and progress might not be
* made elsewhere, so we cannot allow a process to exit the barrier until
* its last [round of] sends are completed.
*
* It is the last round of sends rather than the 'last' individual send, as
* each pair of peers can use different channels/devices/btls and the
* receiver of one of these sends might be forced to wait as the sender
* leaves the collective and does not make progress until the next MPI call.
*
*/
@ -124,7 +108,7 @@ ompi_coll_tuned_sendrecv_zero(int dest, int stag,
* synchronous guarantee made by the last ring of sends being synchronous
*
*/
int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, err = 0, line = 0, left, right;
@ -132,50 +116,50 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_doublering rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));
left = ((rank-1)%size);
right = ((rank+1)%size);
if (rank > 0) { /* receive message from the left */
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
/* Send message to the right */
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* root needs to receive from the last node */
if (rank == 0) {
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
/* Allow nodes to exit */
if (rank > 0) { /* post Receive from left */
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
/* send message to the right one */
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
/* rank 0 post receive from the last node */
if (rank == 0) {
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
}
@ -183,7 +167,7 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@ -193,15 +177,15 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
* To make synchronous, uses sync sends and sync sendrecvs
*/
int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, adjsize, err, line, mask, remote;
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_recursivedoubling rank %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_recursivedoubling rank %d",
rank));
/* do nearest power of 2 less than size calc */
@ -213,7 +197,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
if (rank >= adjsize) {
/* send message to lower ranked node */
remote = rank - adjsize;
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
remote, MCA_COLL_BASE_TAG_BARRIER,
comm);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -222,7 +206,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
/* receive message from high level rank */
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
MCA_COLL_BASE_TAG_BARRIER, comm,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -238,7 +222,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
if (remote >= adjsize) continue;
/* post receive from the remote node */
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
remote, MCA_COLL_BASE_TAG_BARRIER,
comm);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -250,8 +234,8 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
if (rank < (size - adjsize)) {
/* send enter message to higher ranked node */
remote = rank + adjsize;
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
MCA_COLL_BASE_TAG_BARRIER,
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -261,7 +245,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
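Below is a compact sketch of the recursive-doubling core for the power-of-two part of the communicator; the fold-in/fold-out of the extra ranks handled above is deliberately omitted, and the function name and tag 0 are illustrative assumptions.
#include <mpi.h>
/* Recursive-doubling core for the 2^k ranks that survive the fold-in:
 * at each round, exchange a zero-byte message with rank ^ mask. */
static int barrier_recursive_doubling_sketch(MPI_Comm comm)
{
    int rank, size, adjsize, err = MPI_SUCCESS;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    /* nearest power of two <= size, as in the code above */
    for (adjsize = 1; adjsize <= size / 2; adjsize <<= 1)
        ;
    if (rank < adjsize) {
        for (int mask = 1; mask < adjsize && MPI_SUCCESS == err; mask <<= 1)
            err = MPI_Sendrecv(NULL, 0, MPI_BYTE, rank ^ mask, 0,
                               NULL, 0, MPI_BYTE, rank ^ mask, 0,
                               comm, MPI_STATUS_IGNORE);
    }
    return err;   /* ranks >= adjsize are folded in/out separately above */
}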
@ -271,23 +255,23 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
* To make synchronous, uses sync sends and sync sendrecvs
*/
int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, distance, to, from, err, line = 0;
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_bruck rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_bruck rank %d", rank));
/* exchange data with rank-2^k and rank+2^k */
for (distance = 1; distance < size; distance <<= 1) {
for (distance = 1; distance < size; distance <<= 1) {
from = (rank + size - distance) % size;
to = (rank + distance) % size;
/* send message to lower ranked node */
err = ompi_coll_tuned_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
from, MCA_COLL_BASE_TAG_BARRIER,
comm);
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
@ -296,7 +280,7 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
return MPI_SUCCESS;
err_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
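As a worked example of the loop above: with size = 5 the distances are 1, 2, 4, so rank r synchronizes with r-1/r+1, then r-2/r+2, then r-4/r+4 (mod 5). After ceil(log2(5)) = 3 rounds every rank has a synchronization chain to every other rank, which is why this dissemination barrier completes in logarithmic time for any communicator size, power of two or not.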
@ -306,17 +290,17 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
* To make synchronous, uses sync sends and sync sendrecvs
*/
/* special case for two processes */
int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int remote, err;
remote = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_two_procs rank %d", remote));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_two_procs rank %d", remote));
remote = (remote + 1) & 0x1;
err = ompi_coll_tuned_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
remote, MCA_COLL_BASE_TAG_BARRIER,
comm);
return (err);
@ -327,7 +311,7 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -337,8 +321,8 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
/* copied function (with appropriate renaming) starts here */
static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, err, rank, size;
@ -347,14 +331,14 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
/* All non-root send & receive zero-length message. */
if (rank > 0) {
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err) {
return err;
}
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
MCA_COLL_BASE_TAG_BARRIER,
comm, MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err) {
@ -370,7 +354,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
requests = (ompi_request_t**)malloc( size * sizeof(ompi_request_t*) );
for (i = 1; i < size; ++i) {
err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
MCA_COLL_BASE_TAG_BARRIER, comm,
MCA_COLL_BASE_TAG_BARRIER, comm,
&(requests[i])));
if (MPI_SUCCESS != err) {
return err;
@ -379,15 +363,14 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
for (i = 1; i < size; ++i) {
err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, i,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm,
&(requests[i])));
err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err) {
return err;
}
}
ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
free( requests );
}
@ -400,17 +383,17 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
/*
* Another recursive doubling type algorithm, but in this case
* we go up the tree and back down the tree.
* we go up the tree and back down the tree.
*/
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, size, depth, err, jump, partner;
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_barrier_intra_tree %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_barrier_intra_tree %d",
rank));
/* Find the nearest power of 2 of the communicator size. */
@ -420,21 +403,21 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
partner = rank ^ jump;
if (!(partner & (jump-1)) && partner < size) {
if (partner > rank) {
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err)
return err;
} else if (partner < rank) {
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
MCA_COLL_BASE_TAG_BARRIER,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err)
return err;
}
}
}
depth >>= 1;
for (jump = depth; jump>0; jump>>=1) {
partner = rank ^ jump;
@ -446,7 +429,7 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
if (MPI_SUCCESS != err)
return err;
} else if (partner < rank) {
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
MCA_COLL_BASE_TAG_BARRIER, comm,
MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err)
@ -457,101 +440,3 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
return MPI_SUCCESS;
}
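The two loops above are a fan-in followed by a fan-out over the same power-of-two partner pattern: on the way up, at jump = 1, 2, 4, ... the lower rank of each (rank, rank ^ jump) pair receives a zero-byte message from the higher one until everything drains toward rank 0; depth is then halved and the same pairs release each other in reverse on the way down.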
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if it's forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map */
/* routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values */
/* and perms */
/* module does not call this; they call the forced_getvalues routine instead */
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[BARRIER] = coll_tuned_barrier_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm_count",
"Number of barrier algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_barrier_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_barrier_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_barrier_algorithms", barrier_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm",
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_barrier_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:barrier_intra_do_forced selected algorithm %d",
data->user_forced[BARRIER].algorithm));
switch (data->user_forced[BARRIER].algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module);
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module);
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module);
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module);
case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[BARRIER].algorithm,
ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
switch (algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm, module);
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm, module);
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm, module);
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm, module);
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm, module);
case (6): return ompi_coll_tuned_barrier_intra_tree (comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
} /* switch */
}
@ -3,18 +3,18 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -27,33 +27,14 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* bcast algorithm variables */
static int coll_tuned_bcast_algorithm_count = 6;
static int coll_tuned_bcast_forced_algorithm = 0;
static int coll_tuned_bcast_segment_size = 0;
static int coll_tuned_bcast_tree_fanout;
static int coll_tuned_bcast_chain_fanout;
/* valid values for coll_tuned_bcast_forced_algorithm */
static mca_base_var_enum_value_t bcast_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "chain"},
{3, "pipeline"},
{4, "split_binary_tree"},
{5, "binary_tree"},
{6, "binomial"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
int
ompi_coll_tuned_bcast_intra_generic( void* buffer,
int original_count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_generic( void* buffer,
int original_count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -62,12 +43,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
{
int err = 0, line, i, rank, size, segindex, req_index;
int num_segments; /* Number of segments */
int sendcount; /* number of elements sent in this segment */
int sendcount; /* number of elements sent in this segment */
size_t realsegsize, type_size;
char *tmpbuf;
ptrdiff_t extent, lb;
ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
ompi_request_t **send_reqs = NULL;
#endif
@ -79,20 +60,20 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
ompi_datatype_type_size( datatype, &type_size );
num_segments = (original_count + count_by_segment - 1) / count_by_segment;
realsegsize = (ptrdiff_t)count_by_segment * extent;
/* Set the buffer pointers */
tmpbuf = (char *) buffer;
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
if( tree->tree_nextsize != 0 ) {
send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
sizeof(ompi_request_t*) );
}
#endif
/* Root code */
if( rank == root ) {
/*
/*
For each segment:
- send segment to all children.
The last segment may have fewer elements than other segments.
@ -102,39 +83,39 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
if( segindex == (num_segments - 1) ) {
sendcount = original_count - segindex * count_by_segment;
}
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
#else
err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm,
MCA_PML_BASE_SEND_STANDARD, comm,
&send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
/* complete the sends before starting the next sends */
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* not COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* not COLL_BASE_BCAST_USE_BLOCKING */
/* update tmp buffer */
tmpbuf += realsegsize;
}
}
}
/* Intermediate nodes code */
else if( tree->tree_nextsize > 0 ) {
/*
Create the pipeline.
else if( tree->tree_nextsize > 0 ) {
/*
Create the pipeline.
1) Post the first receive
2) For segments 1 .. num_segments
- post new receive
@ -149,49 +130,49 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &recv_reqs[req_index]));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
for( segindex = 1; segindex < num_segments; segindex++ ) {
req_index = req_index ^ 0x1;
/* post new irecv */
err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, count_by_segment,
datatype, tree->tree_prev,
MCA_COLL_BASE_TAG_BCAST,
datatype, tree->tree_prev,
MCA_COLL_BASE_TAG_BCAST,
comm, &recv_reqs[req_index]));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* wait for and forward the previous segment to children */
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
err = MCA_PML_CALL(send(tmpbuf, count_by_segment, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
#else
err = MCA_PML_CALL(isend(tmpbuf, count_by_segment, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm,
MCA_PML_BASE_SEND_STANDARD, comm,
&send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
}
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
/* complete the sends before starting the next iteration */
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
/* Update the receive buffer */
tmpbuf += realsegsize;
}
/* Process the last segment */
@ -199,31 +180,31 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
sendcount = original_count - (ptrdiff_t)(num_segments - 1) * count_by_segment;
for( i = 0; i < tree->tree_nextsize; i++ ) {
#if defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if defined(COLL_BASE_BCAST_USE_BLOCKING)
err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
#else
err = MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
tree->tree_next[i],
tree->tree_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm,
MCA_PML_BASE_SEND_STANDARD, comm,
&send_reqs[i]));
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
err = ompi_request_wait_all( tree->tree_nextsize, send_reqs,
MPI_STATUSES_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
#endif /* COLL_TUNED_BCAST_USE_BLOCKING */
#endif /* COLL_BASE_BCAST_USE_BLOCKING */
}
/* Leaf nodes */
else {
/*
/*
Receive all segments from parent in a loop:
1) post irecv for the first segment
2) for segments 1 .. num_segments
@ -241,12 +222,12 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
req_index = req_index ^ 0x1;
tmpbuf += realsegsize;
/* post receive for the next segment */
err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
err = MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &recv_reqs[req_index]));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* wait on the previous segment */
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
err = ompi_request_wait( &recv_reqs[req_index ^ 0x1],
MPI_STATUS_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
@ -255,25 +236,25 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
if( NULL != send_reqs ) free(send_reqs);
#endif
return (MPI_SUCCESS);
error_hndl:
OPAL_OUTPUT( (ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank) );
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
if( NULL != send_reqs ) free(send_reqs);
#endif
return (err);
}
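The three branches above (root, interior node, leaf) share one pipeline idea: keep two receive slots in flight and flip req_index between them. A single-child sketch of the interior-node loop follows, using the public API; the name, tag 0, and equal-sized segments are illustrative assumptions (the real code isends to every tree_next child and trims the final segment).
#include <mpi.h>
/* Interior-node pipeline with two receive slots: while segment k-1 is
 * being forwarded, the receive for segment k is already posted, and
 * req_index flips between the slots exactly as in the code above. */
static int pipeline_relay_sketch(char *buf, int nseg, int seg_bytes,
                                 int parent, int child, MPI_Comm comm)
{
    MPI_Request recv_reqs[2] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL };
    int req_index = 0, err;
    /* 1) post the receive for the first segment */
    err = MPI_Irecv(buf, seg_bytes, MPI_BYTE, parent, 0, comm,
                    &recv_reqs[req_index]);
    if (MPI_SUCCESS != err) return err;
    for (int seg = 1; seg < nseg; ++seg) {
        req_index ^= 0x1;
        /* 2a) post the receive for segment `seg` ... */
        err = MPI_Irecv(buf + (size_t)seg * seg_bytes, seg_bytes, MPI_BYTE,
                        parent, 0, comm, &recv_reqs[req_index]);
        if (MPI_SUCCESS != err) return err;
        /* 2b) ... then wait for segment seg-1 and forward it down the tree */
        err = MPI_Wait(&recv_reqs[req_index ^ 0x1], MPI_STATUS_IGNORE);
        if (MPI_SUCCESS != err) return err;
        err = MPI_Send(buf + (size_t)(seg - 1) * seg_bytes, seg_bytes,
                       MPI_BYTE, child, 0, comm);
        if (MPI_SUCCESS != err) return err;
    }
    /* 3) wait for and forward the last segment */
    err = MPI_Wait(&recv_reqs[req_index], MPI_STATUS_IGNORE);
    if (MPI_SUCCESS != err) return err;
    return MPI_Send(buf + (size_t)(nseg - 1) * seg_bytes, seg_bytes,
                    MPI_BYTE, child, 0, comm);
}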
int
ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -281,28 +262,27 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_BINTREE( comm, module, root );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binary rank %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_bintree );
}
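As a worked example of the segment arithmetic, assuming segcount comes out as segsize divided by the type size: broadcasting doubles (typelng = 8) with segsize = 1024 gives segcount = 1024 / 8 = 128 elements per segment, so a 10000-element buffer is pipelined as (10000 + 127) / 128 = 79 segments, the last one carrying only 10000 - 78 * 128 = 16 elements, which is exactly the sendcount adjustment the generic routine above performs for the final segment.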
int
ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_pipeline( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -310,28 +290,27 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );
COLL_BASE_UPDATE_PIPELINE( comm, module, root );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_pipeline rank %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_pipeline );
}
int
ompi_coll_tuned_bcast_intra_chain( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_chain( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -339,28 +318,27 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_CHAIN( comm, tuned_module, root, chains );
COLL_BASE_UPDATE_CHAIN( comm, module, root, chains );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_chain );
}
int
ompi_coll_tuned_bcast_intra_binomial( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_binomial( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -368,28 +346,27 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer,
{
int segcount = count;
size_t typelng;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
COLL_TUNED_UPDATE_BMTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_BMTREE( comm, module, root );
/**
* Determine number of elements sent per operation.
*/
ompi_datatype_type_size( datatype, &typelng );
COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
COLL_BASE_COMPUTED_SEGCOUNT( segsize, typelng, segcount );
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_binomial rank %d ss %5d typelng %lu segcount %d",
ompi_comm_rank(comm), segsize, (unsigned long)typelng, segcount));
return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
segcount, data->cached_bmtree );
}
int
ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
ompi_coll_base_bcast_intra_split_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
mca_coll_base_module_t *module,
@ -399,26 +376,25 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
uint32_t counts[2];
int segcount[2]; /* Number of elements sent with each segment */
int num_segments[2]; /* Number of segments */
int sendcount[2]; /* the same as segcount, except for the last segment */
int sendcount[2]; /* the same as segcount, except for the last segment */
size_t realsegsize[2], type_size;
char *tmpbuf[2];
ptrdiff_t type_extent, lb;
ompi_request_t *base_req, *new_req;
ompi_coll_tree_t *tree;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_comm_t *data = module->base_data;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_split_bintree rank %d root %d ss %5d", rank, root, segsize));
if (size == 1) {
return MPI_SUCCESS;
}
/* setup the binary tree topology. */
COLL_TUNED_UPDATE_BINTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_BINTREE( comm, module, root );
tree = data->cached_bintree;
err = ompi_datatype_type_size( datatype, &type_size );
@ -431,10 +407,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
/* Note that ompi_datatype_type_size() will never return a negative
value in typelng; it returns an int [vs. an unsigned type]
because of the MPI spec. */
if (segsize < ((uint32_t) type_size)) {
if (segsize < ((uint32_t) type_size)) {
segsize = type_size; /* push segsize up to hold one type */
}
segcount[0] = segcount[1] = segsize / type_size;
segcount[0] = segcount[1] = segsize / type_size;
num_segments[0] = counts[0]/segcount[0];
if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
num_segments[1] = counts[1]/segcount[1];
@ -450,17 +426,17 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
(segsize > ((ptrdiff_t)counts[0] * type_size)) ||
(segsize > ((ptrdiff_t)counts[1] * type_size)) ) {
/* call linear version here ! */
return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype,
return (ompi_coll_base_bcast_intra_chain ( buffer, count, datatype,
root, comm, module,
segsize, 1 ));
}
err = ompi_datatype_get_extent (datatype, &lb, &type_extent);
/* Determine real segment size */
realsegsize[0] = (ptrdiff_t)segcount[0] * type_extent;
realsegsize[1] = (ptrdiff_t)segcount[1] * type_extent;
/* set the buffer pointers */
tmpbuf[0] = (char *) buffer;
tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;
@ -473,11 +449,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
/* determine if I am left (0) or right (1), (root is right) */
lr = ((rank + size - root)%size + 1)%2;
/* root code */
if( rank == root ) {
/* determine segment count */
sendcount[0] = segcount[0];
sendcount[0] = segcount[0];
sendcount[1] = segcount[1];
/* for each segment */
for (segindex = 0; segindex < num_segments[0]; segindex++) {
@ -487,7 +463,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
continue;
}
/* determine how many elements are being sent in this round */
if(segindex == (num_segments[i] - 1))
if(segindex == (num_segments[i] - 1))
sendcount[i] = counts[i] - segindex*segcount[i];
/* send data */
MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype,
@ -498,19 +474,19 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
tmpbuf[i] += realsegsize[i];
}
}
}
}
/* intermediate nodes code */
else if( tree->tree_nextsize > 0 ) {
else if( tree->tree_nextsize > 0 ) {
/* Intermediate nodes:
* It will receive segments only from one half of the data.
* Which one is determined by whether the node belongs to the "left" or "right"
* Which one is determined by whether the node belongs to the "left" or "right"
* subtree. The topology-building function builds the binary tree such that
* odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
* and even on the right subtree.
*
* Create the pipeline. We first post the first receive, then in the loop we
* post the next receive and after that wait for the previous receive to complete
* post the next receive and after that wait for the previous receive to complete
* and then disseminate the data to all children.
*/
sendcount[lr] = segcount[lr];
@ -521,11 +497,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
/* determine how many elements to expect in this round */
if( segindex == (num_segments[lr] - 1))
if( segindex == (num_segments[lr] - 1))
sendcount[lr] = counts[lr] - (ptrdiff_t)segindex * (ptrdiff_t)segcount[lr];
/* post new irecv */
err = MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &new_req));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -539,7 +515,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
} /* end of for each child */
/* update the base request */
base_req = new_req;
base_req = new_req;
/* go to the next buffer (ie. the one corresponding to the next recv) */
tmpbuf[lr] += realsegsize[lr];
} /* end of for segindex */
@ -552,10 +528,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} /* end of for each child */
}
}
/* leaf nodes */
else {
else {
/* Just consume segments as fast as possible */
sendcount[lr] = segcount[lr];
for (segindex = 0; segindex < num_segments[lr]; segindex++) {
@ -577,9 +553,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
tmpbuf[1] = (char *) buffer + (ptrdiff_t)counts[0] * type_extent;
/* Step 2:
Find your immediate pair (identical node in opposite subtree) and SendRecv
Find your immediate pair (identical node in opposite subtree) and SendRecv
data buffer with them.
The tree building function ensures that
The tree building function ensures that
if (we are not root)
if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
if we are in the right subtree (lr == 1) our pair is (rank-1)%size
@ -591,9 +567,9 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
pair = (rank+size-1)%size;
}
if ( (size%2) != 0 && rank != root) {
if ( (size%2) != 0 && rank != root) {
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
@ -607,28 +583,28 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
/* last node receives right buffer from the root */
else if (rank == (root+size-1)%size) {
err = MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype,
root, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
/* everyone else exchanges buffers */
else {
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
err = ompi_coll_base_sendrecv( tmpbuf[lr], counts[lr], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE, rank);
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
return (MPI_SUCCESS);
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
return (err);
}
@ -636,8 +612,8 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
@ -655,21 +631,20 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, size, rank, err;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
ompi_request_t **preq, **reqs = data->mcct_reqs;
mca_coll_base_comm_t *data = module->base_data;
ompi_request_t **preq, **reqs;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_basic_linear rank %d root %d", rank, root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_bcast_intra_basic_linear rank %d root %d", rank, root));
/* Non-root receive the data. */
@ -680,8 +655,8 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
}
/* Root sends data to all others. */
for (i = 0, preq = reqs; i < size; ++i) {
preq = reqs = coll_base_comm_get_reqs(data, size-1);
for (i = 0; i < size; ++i) {
if (i == rank) {
continue;
}
@ -691,6 +666,7 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
MCA_PML_BASE_SEND_STANDARD,
comm, preq++));
if (MPI_SUCCESS != err) {
ompi_coll_base_free_reqs(data->mcct_reqs, i);
return err;
}
}
@ -710,148 +686,11 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
/* Free the reqs */
ompi_coll_tuned_free_reqs(reqs, i);
ompi_coll_base_free_reqs(reqs, i);
/* All done */
return err;
}
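/* Editor's note (not in the original source): in this linear variant the
 * root posts size-1 isends in a single round, so it costs one communication
 * step but O(P) injected messages; the segmented tree variants above
 * instead pipeline segments over O(log P) rounds. */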
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and whether it's forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and perms */
/* modules do not call this; they call the forced_getvalues routine instead */
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[BCAST] = coll_tuned_bcast_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_count",
"Number of bcast algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_bcast_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_bcast_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_bcast_algorithms", bcast_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm",
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_bcast_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_segmentsize",
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_segment_size);
coll_tuned_bcast_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_tree_fanout",
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_tree_fanout);
coll_tuned_bcast_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_chain_fanout",
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_chain_fanout);
return (MPI_SUCCESS);
}
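/* Editor's illustration (hypothetical command line; assumes the tuned
 * component is selected and its dynamic-rule path is enabled): the MCA
 * variables registered above let a user pin the algorithm at run time, e.g.
 *
 *   mpirun --mca coll_tuned_use_dynamic_rules 1 \
 *          --mca coll_tuned_bcast_algorithm 3 \
 *          --mca coll_tuned_bcast_algorithm_segmentsize 32768 ./app
 *
 * which forces the pipeline bcast (value 3 in the enum above) with 32 KB
 * segments. */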
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
data->user_forced[BCAST].algorithm));
switch (data->user_forced[BCAST].algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize,
data->user_forced[BCAST].chain_fanout );
case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module,
data->user_forced[BCAST].segsize );
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
} /* switch */
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
case (1): return ompi_coll_tuned_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
case (2): return ompi_coll_tuned_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout );
case (3): return ompi_coll_tuned_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
case (4): return ompi_coll_tuned_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
case (5): return ompi_coll_tuned_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
case (6): return ompi_coll_tuned_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
} /* switch */
return (MPI_ERR_ARG);
}
@ -3,10 +3,10 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
@ -15,9 +15,9 @@
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -33,6 +33,7 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
/*
* The following file was created by configure. It contains extern
@ -49,10 +50,94 @@ static void coll_base_module_construct(mca_coll_base_module_t *m)
/* zero out all functions */
memset ((char *) m + sizeof (m->super), 0, sizeof (*m) - sizeof (m->super));
m->coll_module_disable = NULL;
m->base_data = NULL;
}
OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t,
coll_base_module_construct, NULL);
static void
coll_base_module_destruct(mca_coll_base_module_t *module)
{
if (NULL != module->base_data) {
OBJ_RELEASE(module->base_data);
}
}
OBJ_CLASS_INSTANCE(mca_coll_base_module_t, opal_object_t,
coll_base_module_construct, coll_base_module_destruct);
static void
coll_base_comm_construct(mca_coll_base_comm_t *data)
{
data->mcct_reqs = NULL;
data->mcct_num_reqs = 0;
data->cached_ntree = NULL;
data->cached_bintree = NULL;
data->cached_bmtree = NULL;
data->cached_in_order_bmtree = NULL;
data->cached_chain = NULL;
data->cached_pipeline = NULL;
data->cached_in_order_bintree = NULL;
}
static void
coll_base_comm_destruct(mca_coll_base_comm_t *data)
{
if( NULL != data->mcct_reqs ) {
for( int i = 0; i < data->mcct_num_reqs; ++i ) {
if( MPI_REQUEST_NULL != data->mcct_reqs[i] )
ompi_request_free(&data->mcct_reqs[i]);
}
free(data->mcct_reqs);
data->mcct_reqs = NULL;
data->mcct_num_reqs = 0;
}
assert(0 == data->mcct_num_reqs);
/* free any cached information that has been allocated */
if (data->cached_ntree) { /* destroy general tree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_ntree);
}
if (data->cached_bintree) { /* destroy bintree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_bintree);
}
if (data->cached_bmtree) { /* destroy bmtree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_bmtree);
}
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bmtree);
}
if (data->cached_chain) { /* destroy general chain if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_chain);
}
if (data->cached_pipeline) { /* destroy pipeline if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_pipeline);
}
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
ompi_coll_base_topo_destroy_tree (&data->cached_in_order_bintree);
}
}
OBJ_CLASS_INSTANCE(mca_coll_base_comm_t, opal_object_t,
coll_base_comm_construct, coll_base_comm_destruct);
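/* Editor's sketch (hypothetical allocation site, e.g. a module enable
 * hook): the per-communicator data now follows the usual OPAL object
 * lifecycle,
 *
 *   module->base_data = OBJ_NEW(mca_coll_base_comm_t);  // runs ..._construct
 *   ...
 *   OBJ_RELEASE(module->base_data);                     // runs ..._destruct
 *
 * so cached topologies and requests are reclaimed when the last reference
 * is dropped (cf. coll_base_module_destruct above). */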
ompi_request_t** coll_base_comm_get_reqs(mca_coll_base_comm_t* data, int nreqs)
{
int startfrom = data->mcct_num_reqs;
if( NULL == data->mcct_reqs ) {
assert(0 == data->mcct_num_reqs);
data->mcct_reqs = (ompi_request_t**)malloc(sizeof(ompi_request_t*) * nreqs);
} else if( data->mcct_num_reqs <= nreqs ) {
data->mcct_reqs = (ompi_request_t**)realloc(data->mcct_reqs, sizeof(ompi_request_t*) * nreqs);
}
if( NULL != data->mcct_reqs ) {
data->mcct_num_reqs = nreqs;
for( int i = startfrom; i < data->mcct_num_reqs; i++ )
data->mcct_reqs[i] = MPI_REQUEST_NULL;
} else
data->mcct_num_reqs = 0; /* nothing to return */
return data->mcct_reqs;
}
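/* Editor's sketch of the intended call pattern (cf. the basic linear bcast
 * earlier in this diff):
 *
 *   ompi_request_t **reqs = coll_base_comm_get_reqs(data, size - 1);
 *   if (NULL == reqs) return OMPI_ERR_OUT_OF_RESOURCE;  // allocation failed
 *   ... post isends/irecvs into reqs[0 .. size-2] ...
 *   err = ompi_request_wait_all(size - 1, reqs, MPI_STATUSES_IGNORE);
 *   ompi_coll_base_free_reqs(reqs, size - 1);  // slots back to MPI_REQUEST_NULL
 */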
MCA_BASE_FRAMEWORK_DECLARE(ompi, coll, "Collectives", NULL, NULL, NULL,
mca_coll_base_static_components, 0);
ompi/mca/coll/base/coll_base_functions.h (new file, 355 lines)
@ -0,0 +1,355 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_BASE_EXPORT_H
#define MCA_COLL_BASE_EXPORT_H
#include "ompi_config.h"
#include "ompi/mca/coll/base/base.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_base_topo.h"
/* some fixed value index vars to simplify certain operations */
typedef enum COLLTYPE {
ALLGATHER = 0, /* 0 */
ALLGATHERV, /* 1 */
ALLREDUCE, /* 2 */
ALLTOALL, /* 3 */
ALLTOALLV, /* 4 */
ALLTOALLW, /* 5 */
BARRIER, /* 6 */
BCAST, /* 7 */
EXSCAN, /* 8 */
GATHER, /* 9 */
GATHERV, /* 10 */
REDUCE, /* 11 */
REDUCESCATTER, /* 12 */
SCAN, /* 13 */
SCATTER, /* 14 */
SCATTERV, /* 15 */
COLLCOUNT /* 16 end counter keep it as last element */
} COLLTYPE_T;
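/* Editor's illustration (hypothetical table, not in the original source):
 * the enum values are dense in [0, COLLCOUNT), so they can index
 * per-collective arrays, as the tuned component's forced-algorithm tables
 * elsewhere in this diff do:
 *
 *   static int forced_algorithm[COLLCOUNT];  // one slot per collective
 *   ...
 *   forced_algorithm[BCAST] = 3;             // e.g. pick algorithm 3 for bcast
 */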
/* defined arg lists to simplify auto inclusion of user overriding decision functions */
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
/* end defined arg lists to simplify auto inclusion of user overriding decision functions */
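/* Editor's illustration: with BCAST_ARGS above, a prototype such as
 *
 *   int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
 *
 * expands to
 *
 *   int ompi_coll_base_bcast_intra_chain(void *buff, int count,
 *           struct ompi_datatype_t *datatype, int root,
 *           struct ompi_communicator_t *comm, mca_coll_base_module_t *module,
 *           uint32_t segsize, int32_t chains);
 */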
BEGIN_C_DECLS
/* All Gather */
int ompi_coll_base_allgather_intra_bruck(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
/* All GatherV */
int ompi_coll_base_allgatherv_intra_bruck(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_ring(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
/* All Reduce */
int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS);
int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
/* AlltoAll */
int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
int ompi_coll_base_alltoall_intra_bruck(ALLTOALL_ARGS);
int ompi_coll_base_alltoall_intra_basic_linear(ALLTOALL_ARGS);
int ompi_coll_base_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
int ompi_coll_base_alltoall_intra_two_procs(ALLTOALL_ARGS);
int mca_coll_base_alltoall_intra_basic_inplace(void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); /* special version for INPLACE */
/* AlltoAllV */
int ompi_coll_base_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
int ompi_coll_base_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
int mca_coll_base_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module); /* special version for INPLACE */
/* AlltoAllW */
/* Barrier */
int ompi_coll_base_barrier_intra_doublering(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_recursivedoubling(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_bruck(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_two_procs(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_tree(BARRIER_ARGS);
int ompi_coll_base_barrier_intra_basic_linear(BARRIER_ARGS);
/* Bcast */
int ompi_coll_base_bcast_intra_basic_linear(BCAST_ARGS);
int ompi_coll_base_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
int ompi_coll_base_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
int ompi_coll_base_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
int ompi_coll_base_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
int ompi_coll_base_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
/* Exscan */
/* Gather */
int ompi_coll_base_gather_intra_basic_linear(GATHER_ARGS);
int ompi_coll_base_gather_intra_binomial(GATHER_ARGS);
int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
/* GatherV */
/* Reduce */
int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS);
int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
/* Reduce_scatter */
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
/* Scan */
/* Scatter */
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
/* ScatterV */
END_C_DECLS
#define COLL_BASE_UPDATE_BINTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_bintree) \
&& (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bintree ) { /* destroy previous binomial if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
} \
coll_comm->cached_bintree = ompi_coll_base_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
coll_comm->cached_bintree_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_bmtree) \
&& (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
} \
coll_comm->cached_bmtree = ompi_coll_base_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_in_order_bmtree) \
&& (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \
} \
coll_comm->cached_in_order_bmtree = ompi_coll_base_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_in_order_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_PIPELINE( OMPI_COMM, BASE_MODULE, ROOT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_pipeline) \
&& (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
} \
coll_comm->cached_pipeline = ompi_coll_base_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
coll_comm->cached_pipeline_root = (ROOT); \
} \
} while (0)
#define COLL_BASE_UPDATE_CHAIN( OMPI_COMM, BASE_MODULE, ROOT, FANOUT ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !( (coll_comm->cached_chain) \
&& (coll_comm->cached_chain_root == (ROOT)) \
&& (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
ompi_coll_base_topo_destroy_tree( &(coll_comm->cached_chain) ); \
} \
coll_comm->cached_chain = ompi_coll_base_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \
coll_comm->cached_chain_root = (ROOT); \
coll_comm->cached_chain_fanout = (FANOUT); \
} \
} while (0)
#define COLL_BASE_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, BASE_MODULE ) \
do { \
mca_coll_base_comm_t* coll_comm = (BASE_MODULE)->base_data; \
if( !(coll_comm->cached_in_order_bintree) ) { \
/* In-order binary tree topology is defined by communicator size */ \
/* Thus, there is no need to destroy anything */ \
coll_comm->cached_in_order_bintree = \
ompi_coll_base_topo_build_in_order_bintree((OMPI_COMM)); \
} \
} while (0)
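/* Editor's sketch of how the caching macros above are used (it mirrors the
 * bcast wrappers earlier in this diff): refresh the cache only when the
 * root (or fanout) changes, then reuse the cached topology:
 *
 *   COLL_BASE_UPDATE_BMTREE( comm, module, root );
 *   tree = module->base_data->cached_bmtree;
 */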
/**
* This macro gives a generic way to compute the best count of
* the segment (i.e. the number of complete datatypes that
* can fit in the specified SEGSIZE). Beware, when this macro
* is called, the SEGCOUNT should be initialized to the count as
* expected by the collective call.
*/
#define COLL_BASE_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \
if( ((SEGSIZE) >= (TYPELNG)) && \
((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \
size_t residual; \
(SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \
residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \
if( residual > ((TYPELNG) >> 1) ) \
(SEGCOUNT)++; \
} \
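
/* Editor's worked example: segsize = 1024 bytes, a 12-byte datatype and an
 * initial segcount of 1000 elements satisfy the guard (1024 >= 12 and
 * 1024 < 12000), so segcount becomes 1024/12 = 85; the residual is
 * 1024 - 85*12 = 4 bytes, and since 4 <= (12 >> 1) the count is not rounded
 * up: each segment carries 85 elements (1020 bytes). */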
/**
* This macro gives a generic way to compute well-distributed block counts
* when the count and number of blocks are fixed.
* Macro returns "early-block" count, "late-block" count, and "split-index"
* which is the block at which we switch from "early-block" count to
* the "late-block" count.
* count = split_index * early_block_count +
* (block_count - split_index) * late_block_count
* We do not perform ANY error checks - make sure that the input values
* make sense (e.g. count > num_blocks).
*/
#define COLL_BASE_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \
EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \
SPLIT_INDEX = COUNT % NUM_BLOCKS; \
if (0 != SPLIT_INDEX) { \
EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \
} \
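
/* Editor's worked example: COUNT = 10 and NUM_BLOCKS = 4 give
 * EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = 10/4 = 2 and SPLIT_INDEX = 2, so
 * the early count is bumped to 3; the blocks then hold 3, 3, 2, 2 elements,
 * and 2*3 + (4-2)*2 = 10 as the identity above requires. */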
/*
* Data structure for hanging data off the communicator
* i.e. per module instance
*/
struct mca_coll_base_comm_t {
opal_object_t super;
/* standard data for requests and PML usage */
/* Precreate space for requests
* Note this does not affect basic,
* but in the wrong context it can confuse a debugger;
* this is controlled by an MCA param
*/
ompi_request_t **mcct_reqs;
int mcct_num_reqs;
/*
* base topo information caching per communicator
*
* for each communicator we cache the topo information so we can
* reuse it without regenerating; if the root [or fanout] changes,
* we regenerate and recache this information
*/
/* general tree with n fan out */
ompi_coll_tree_t *cached_ntree;
int cached_ntree_root;
int cached_ntree_fanout;
/* binary tree */
ompi_coll_tree_t *cached_bintree;
int cached_bintree_root;
/* binomial tree */
ompi_coll_tree_t *cached_bmtree;
int cached_bmtree_root;
/* binomial tree */
ompi_coll_tree_t *cached_in_order_bmtree;
int cached_in_order_bmtree_root;
/* chained tree (fanout followed by pipelines) */
ompi_coll_tree_t *cached_chain;
int cached_chain_root;
int cached_chain_fanout;
/* pipeline */
ompi_coll_tree_t *cached_pipeline;
int cached_pipeline_root;
/* in-order binary tree (root of the in-order binary tree is rank 0) */
ompi_coll_tree_t *cached_in_order_bintree;
};
typedef struct mca_coll_base_comm_t mca_coll_base_comm_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t);
static inline void ompi_coll_base_free_reqs(ompi_request_t **reqs, int count)
{
int i;
for (i = 0; i < count; ++i)
ompi_request_free(&reqs[i]);
}
/**
* Return the array of requests cached on the data. If the array was not
* initialized, or if its size was too small, (re)allocate it to fit the
* requested size.
*/
ompi_request_t** coll_base_comm_get_reqs(mca_coll_base_comm_t* data, int nreqs);
#endif /* MCA_COLL_BASE_EXPORT_H */
@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,30 +30,14 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* gather algorithm variables */
static int coll_tuned_gather_algorithm_count = 3;
static int coll_tuned_gather_forced_algorithm = 0;
static int coll_tuned_gather_segment_size = 0;
static int coll_tuned_gather_tree_fanout;
static int coll_tuned_gather_chain_fanout;
/* valid values for coll_tuned_gather_forced_algorithm */
static mca_base_var_enum_value_t gather_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "binomial"},
{3, "linear_sync"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain,
* gather_intra_pipeline, segmentation? */
int
ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
ompi_coll_base_gather_intra_binomial(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -65,19 +49,19 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
char *ptmp = NULL, *tempbuf = NULL;
ompi_coll_tree_t* bmtree;
MPI_Status status;
MPI_Aint sextent, slb, strue_lb, strue_extent;
MPI_Aint sextent, slb, strue_lb, strue_extent;
MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_binomial rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_binomial rank %d", rank));
/* create the binomial tree */
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
bmtree = data->cached_in_order_bmtree;
ompi_datatype_get_extent(sdtype, &slb, &sextent);
@ -112,7 +96,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
} else {
/* copy from rbuf to temp buffer */
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
(char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
@ -157,8 +141,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
mycount = size - vkid;
mycount *= rcount;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d",
rank, bmtree->tree_next[i], mycount));
err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
@ -172,8 +156,8 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
if (rank != root) {
/* all nodes except root send to parents */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n",
rank, bmtree->tree_prev, total_recv));
err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
@ -207,7 +191,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
if (NULL != tempbuf)
free(tempbuf);
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
@ -220,25 +204,25 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
ompi_coll_base_gather_intra_linear_sync(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int first_segment_size)
{
int i, ret, line, rank, size, first_segment_count;
ompi_request_t **reqs = NULL;
MPI_Aint extent, lb;
size_t typelng;
ompi_request_t **reqs = NULL;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
if (rank != root) {
/* Non-root processes:
@ -250,10 +234,10 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
ompi_datatype_type_size(sdtype, &typelng);
ompi_datatype_get_extent(sdtype, &lb, &extent);
first_segment_count = scount;
COLL_TUNED_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
COLL_BASE_COMPUTED_SEGCOUNT( (size_t) first_segment_size, typelng,
first_segment_count );
ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root,
ret = MCA_PML_CALL(recv(sbuf, 0, MPI_BYTE, root,
MCA_COLL_BASE_TAG_GATHER,
comm, MPI_STATUS_IGNORE));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -263,15 +247,15 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
MCA_PML_BASE_SEND_STANDARD, comm));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
(scount - first_segment_count), sdtype,
ret = MCA_PML_CALL(send((char*)sbuf + extent * first_segment_count,
(scount - first_segment_count), sdtype,
root, MCA_COLL_BASE_TAG_GATHER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} else {
/* Root process,
/* Root process,
- For every non-root node:
- post irecv for the first segment of the message
- send zero byte message to signal node to send the message
@ -284,20 +268,20 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
ompi_request_t *first_segment_req;
reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*));
if (NULL == reqs) { ret = -1; line = __LINE__; goto error_hndl; }
ompi_datatype_type_size(rdtype, &typelng);
ompi_datatype_get_extent(rdtype, &lb, &extent);
first_segment_count = rcount;
COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
COLL_BASE_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
first_segment_count );
ptmp = (char *) rbuf;
for (i = 0; i < size; ++i) {
if (i == rank) {
if (i == rank) {
/* skip myself */
reqs[i] = MPI_REQUEST_NULL;
continue;
}
reqs[i] = MPI_REQUEST_NULL;
continue;
}
/* irecv for the first segment from i */
ptmp = (char*)rbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * extent;
@ -305,7 +289,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
MCA_COLL_BASE_TAG_GATHER, comm,
&first_segment_req));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* send sync message */
ret = MCA_PML_CALL(send(rbuf, 0, MPI_BYTE, i,
MCA_COLL_BASE_TAG_GATHER,
@ -314,7 +298,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
/* irecv for the second segment */
ptmp = (char*)rbuf + ((ptrdiff_t)i * (ptrdiff_t)rcount + first_segment_count) * extent;
ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
ret = MCA_PML_CALL(irecv(ptmp, (rcount - first_segment_count),
rdtype, i, MCA_COLL_BASE_TAG_GATHER, comm,
&reqs[i]));
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -327,11 +311,11 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
/* copy local data if necessary */
if (MPI_IN_PLACE != sbuf) {
ret = ompi_datatype_sndrcv(sbuf, scount, sdtype,
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
rcount, rdtype);
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
/* wait all second segments to complete */
ret = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
@ -346,8 +330,8 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
if (NULL != reqs) {
free(reqs);
}
OPAL_OUTPUT (( ompi_coll_tuned_stream,
"ERROR_HNDL: node %d file %s line %d error %d\n",
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
"ERROR_HNDL: node %d file %s line %d error %d\n",
rank, __FILE__, line, ret ));
return ret;
}
@ -355,13 +339,13 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
@ -373,7 +357,7 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
ompi_coll_base_gather_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
@ -389,8 +373,8 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
rank = ompi_comm_rank(comm);
/* Everyone but root sends data and returns. */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_basic_linear rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_gather_intra_basic_linear rank %d", rank));
if (rank != root) {
return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
@ -427,164 +411,3 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
/* copied function (with appropriate renaming) ends here */
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and whether it's forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* modules do not call this; they call the forced_getvalues routine instead */
int
ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[GATHER] = coll_tuned_gather_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_count",
"Number of gather algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_gather_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_gather_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_gather_algorithms", gather_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm",
"Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_gather_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_segmentsize",
"Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_segment_size);
coll_tuned_gather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_tree_fanout",
"Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_tree_fanout);
coll_tuned_gather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_chain_fanout",
"Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_chain_fanout);
return (MPI_SUCCESS);
}
int
ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_forced selected algorithm %d",
data->user_forced[GATHER].algorithm));
switch (data->user_forced[GATHER].algorithm) {
case (0):
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (3):
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
data->user_forced[GATHER].segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[GATHER].algorithm,
ompi_coll_tuned_forced_max_algorithms[GATHER]));
return (MPI_ERR_ARG);
} /* switch */
}
int
ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (3):
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
segsize);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[GATHER]));
return (MPI_ERR_ARG);
} /* switch */
}
(Diff for one file not shown because of its large size.)
@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -32,37 +32,21 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
/* reduce_scatter algorithm variables */
static int coll_tuned_reduce_scatter_algorithm_count = 2;
static int coll_tuned_reduce_scatter_forced_algorithm = 0;
static int coll_tuned_reduce_scatter_segment_size = 0;
static int coll_tuned_reduce_scatter_tree_fanout;
static int coll_tuned_reduce_scatter_chain_fanout;
/* valid values for coll_tuned_reduce_scatter_forced_algorithm */
static mca_base_var_enum_value_t reduce_scatter_algorithms[] = {
{0, "ignore"},
{1, "non-overlapping"},
{2, "recursive_halfing"},
{3, "ring"},
{0, NULL}
};
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
/*******************************************************************************
* ompi_coll_tuned_reduce_scatter_intra_nonoverlapping
* ompi_coll_base_reduce_scatter_intra_nonoverlapping
*
* This function just calls a reduce to rank 0, followed by an
* This function just calls a reduce to rank 0, followed by an
* appropriate scatterv call.
*/
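/* Editor's illustration: at the MPI level this variant is equivalent to
 *
 *   MPI_Reduce(sbuf, tmp, total_count, dtype, op, 0, comm);
 *   MPI_Scatterv(tmp, rcounts, displs, dtype,
 *                rbuf, rcounts[rank], dtype, 0, comm);
 *
 * where total_count is the sum of rcounts and displs holds its prefix
 * sums, exactly as computed below. */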
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
mca_coll_base_module_t *module)
{
int err, i, rank, size, total_count, *displs = NULL;
const int root = 0;
@ -71,7 +55,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_nonoverlapping, rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_nonoverlapping, rank %d", rank));
for (i = 0, total_count = 0; i < size; i++) { total_count += rcounts[i]; }
@ -80,7 +64,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
if (MPI_IN_PLACE == sbuf) {
/* rbuf on root (0) is big enough to hold whole data */
if (root == rank) {
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
} else {
err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count,
@ -91,13 +75,13 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
/* We must allocate temporary receive buffer on root to ensure that
rbuf is big enough */
ptrdiff_t lb, extent, tlb, textent;
ompi_datatype_get_extent(dtype, &lb, &extent);
ompi_datatype_get_true_extent(dtype, &tlb, &textent);
tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent);
tmprbuf = tmprbuf_free - lb;
}
}
err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count,
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
}
@ -105,7 +89,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
if (NULL != tmprbuf_free) free(tmprbuf_free);
return err;
}
displs = (int*) malloc(size * sizeof(int));
displs[0] = 0;
for (i = 1; i < size; i++) {
@ -122,7 +106,7 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
/*
* Recursive-halving function is (*mostly*) copied from the BASIC coll module.
* I have removed the part which handles "large" message sizes
* I have removed the part which handles "large" message sizes
* (non-overlapping version of reduce_scatter).
*/
@ -131,15 +115,15 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
/*
* reduce_scatter_intra_basic_recursivehalving
*
* Function: - reduce scatter implementation using recursive-halving
* Function: - reduce scatter implementation using recursive-halving
* algorithm
* Accepts: - same as MPI_Reduce_scatter()
* Returns: - MPI_SUCCESS or error code
* Limitation: - Works only for commutative operations.
*/
int
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
void *rbuf,
ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
void *rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
@ -151,12 +135,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
char *recv_buf = NULL, *recv_buf_free = NULL;
char *result_buf = NULL, *result_buf_free = NULL;
/* Initialize */
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:reduce_scatter_intra_basic_recursivehalving, rank %d", rank));
/* Find displacements and the like */
disps = (int*) malloc(sizeof(int) * size);
@ -191,43 +175,43 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
err = OMPI_ERR_OUT_OF_RESOURCE;
goto cleanup;
}
/* allocate temporary buffer for results */
result_buf_free = (char*) malloc(buf_size);
result_buf = result_buf_free - true_lb;
/* copy local buffer into the temporary results */
err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype);
if (OMPI_SUCCESS != err) goto cleanup;
/* figure out power of two mapping: grow until larger than
comm size, then go back one, to get the largest power of
two less than comm size */
tmp_size = opal_next_poweroftwo (size);
tmp_size = opal_next_poweroftwo (size);
tmp_size >>= 1;
remain = size - tmp_size;
/* If comm size is not a power of two, have the first "remain"
procs with an even rank send to rank + 1, leaving a power of
two procs to do the rest of the algorithm */
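    /* Worked example (illustrative, not part of the original source):
       size = 5 gives tmp_size = 4 and remain = 1, so ranks 0 and 1 fold:
       rank 0 sends its partial result to rank 1 and drops out, leaving
       ranks {1,2,3,4} to run the power-of-two phase as tmp ranks
       {0,1,2,3}. */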
if (rank < 2 * remain) {
if ((rank & 1) == 0) {
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
MCA_PML_BASE_SEND_STANDARD,
comm));
if (OMPI_SUCCESS != err) goto cleanup;
/* we don't participate from here on out */
tmp_rank = -1;
} else {
err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
comm, MPI_STATUS_IGNORE));
/* integrate their results into our temp results */
ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
/* adjust rank to be the bottom "remain" ranks */
tmp_rank = rank / 2;
}
@ -236,13 +220,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
remain" ranks dropped out */
tmp_rank = rank - remain;
}
/* For ranks not kicked out by the above code, perform the
recursive halving */
if (tmp_rank >= 0) {
int *tmp_disps = NULL, *tmp_rcounts = NULL;
int mask, send_index, recv_index, last_index;
/* recalculate disps and rcounts to account for the
special "remainder" processes that are no longer doing
anything */
@ -317,11 +301,11 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
}
if (send_count > 0) {
err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
send_count, dtype, peer,
send_count, dtype, peer,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
MCA_PML_BASE_SEND_STANDARD,
comm));
@ -329,7 +313,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
}
/* if we received something on this step, push it into
@ -340,10 +324,10 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
ompi_op_reduce(op,
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
ompi_op_reduce(op,
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
recv_count, dtype);
}
@ -357,13 +341,13 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
/* copy local results from results buffer into real receive buffer */
if (0 != rcounts[rank]) {
err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
rcounts[rank], dtype,
rcounts[rank], dtype,
rbuf, rcounts[rank], dtype);
if (OMPI_SUCCESS != err) {
free(tmp_rcounts);
free(tmp_disps);
goto cleanup;
}
}
}
free(tmp_rcounts);
@ -389,7 +373,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
comm));
if (OMPI_SUCCESS != err) goto cleanup;
}
}
}
}
cleanup:
@ -404,18 +388,18 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
/*
* ompi_coll_tuned_reduce_scatter_intra_ring
* ompi_coll_base_reduce_scatter_intra_ring
*
* Function: Ring algorithm for reduce_scatter operation
* Accepts: Same as MPI_Reduce_scatter()
* Returns: MPI_SUCCESS or error code
*
* Description: Implements ring algorithm for reduce_scatter:
* the block sizes defined in rcounts are exchanged and
* Description: Implements ring algorithm for reduce_scatter:
* the block sizes defined in rcounts are exchanged and
* updated until they reach their proper destination.
* Algorithm requires 2 * max(rcounts) extra buffering
*
* Limitations: The algorithm DOES NOT preserve order of operations so it
* Limitations: The algorithm DOES NOT preserve order of operations so it
* can be used only for commutative operations.
* Example on 5 nodes:
* Initial state
@ -427,7 +411,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* [04] -> [14] [24] [34] [44]
*
* COMPUTATION PHASE
* Step 0: rank r sends block (r-1) to rank (r+1) and
* Step 0: rank r sends block (r-1) to rank (r+1) and
* receives block (r+1) from rank (r-1) [with wraparound].
* # 0 1 2 3 4
* [00] [10] [10+20] -> [30] [40]
@ -435,12 +419,12 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* -> [02] [12] [22] [32] [32+42] -->..
* [43+03] -> [13] [23] [33] [43]
* [04] [04+14] -> [24] [34] [44]
*
*
* Step 1:
* # 0 1 2 3 4
* [00] [10] [10+20] [10+20+30] -> [40]
* -> [01] [11] [21] [21+31] [21+31+41] ->
* [32+42+02] -> [12] [22] [32] [32+42]
* [32+42+02] -> [12] [22] [32] [32+42]
* [03] [43+03+13] -> [23] [33] [43]
* [04] [04+14] [04+14+24] -> [34] [44]
*
@ -448,7 +432,7 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* # 0 1 2 3 4
* -> [00] [10] [10+20] [10+20+30] [10+20+30+40] ->
* [21+31+41+01]-> [11] [21] [21+31] [21+31+41]
* [32+42+02] [32+42+02+12]-> [22] [32] [32+42]
* [32+42+02] [32+42+02+12]-> [22] [32] [32+42]
* [03] [43+03+13] [43+03+13+23]-> [33] [43]
* [04] [04+14] [04+14+24] [04+14+24+34] -> [44]
*
@ -456,14 +440,14 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
* # 0 1 2 3 4
* [10+20+30+40+00] [10] [10+20] [10+20+30] [10+20+30+40]
* [21+31+41+01] [21+31+41+01+11] [21] [21+31] [21+31+41]
* [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42]
* [32+42+02] [32+42+02+12] [32+42+02+12+22] [32] [32+42]
* [03] [43+03+13] [43+03+13+23] [43+03+13+23+33] [43]
* [04] [04+14] [04+14+24] [04+14+24+34] [04+14+24+34+44]
* DONE :)
*
*/
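/*
 * A minimal sketch of the ring bookkeeping described above (illustrative
 * only; rank, size and the step counter k as used in the implementation
 * below):
 *
 *   send_to   = (rank + 1) % size;
 *   recv_from = (rank + size - 1) % size;
 *   prevblock = (rank + size - k) % size;   <- block reduced and forwarded at step k
 *
 * After size-1 steps block r has accumulated a contribution from every
 * rank and comes to rest on rank r.
 */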
int
ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
int
ompi_coll_base_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
@ -480,11 +464,11 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:reduce_scatter_intra_ring rank %d, size %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:reduce_scatter_intra_ring rank %d, size %d",
rank, size));
/* Determine the maximum number of elements per node,
/* Determine the maximum number of elements per node,
corresponding block size, and displacements array.
*/
displs = (int*) malloc(size * sizeof(int));
@ -492,16 +476,16 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
displs[0] = 0;
total_count = rcounts[0];
max_block_count = rcounts[0];
for (i = 1; i < size; i++) {
for (i = 1; i < size; i++) {
displs[i] = total_count;
total_count += rcounts[i];
if (max_block_count < rcounts[i]) max_block_count = rcounts[i];
}
/* Special case for size == 1 */
if (1 == size) {
if (MPI_IN_PLACE != sbuf) {
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
(char*)rbuf, (char*)sbuf);
if (ret < 0) { line = __LINE__; goto error_hndl; }
}
@ -541,13 +525,13 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
sbuf = rbuf;
}
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
accumbuf, (char*)sbuf);
if (ret < 0) { line = __LINE__; goto error_hndl; }
/* Computation loop */
/*
/*
For each of the remote nodes:
- post irecv for block (r-2) from (r-1) with wrap around
- send block (r-1) to (r+1)
@ -568,7 +552,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
inbi = 0;
/* Initialize first receive from the neighbor on the left */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
&reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
tmpsend = accumbuf + (ptrdiff_t)displs[recv_from] * extent;
@ -579,25 +563,25 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
for (k = 2; k < size; k++) {
const int prevblock = (rank + size - k) % size;
inbi = inbi ^ 0x1;
/* Post irecv for the current block */
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
&reqs[inbi]));
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Wait on previous block to arrive */
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
/* Apply operation on previous block: result goes to rbuf
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
*/
tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent;
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype);
/* send previous block to send_to */
ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to,
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
@ -613,7 +597,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
rbuf[rank] = inbuf[inbi] (op) rbuf[rank] */
tmprecv = accumbuf + (ptrdiff_t)displs[rank] * extent;
ompi_op_reduce(op, inbuf[inbi], tmprecv, rcounts[rank], dtype);
/* Copy result from tmprecv to rbuf */
ret = ompi_datatype_copy_content_same_ddt(dtype, rcounts[rank], (char *)rbuf, tmprecv);
if (ret < 0) { line = __LINE__; goto error_hndl; }
@ -626,7 +610,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
return MPI_SUCCESS;
error_hndl:
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
__FILE__, line, rank, ret));
if (NULL != displs) free(displs);
if (NULL != accumbuf_free) free(accumbuf_free);
@ -634,139 +618,3 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
if (NULL != inbuf_free[1]) free(inbuf_free[1]);
return ret;
}
/**
* The following are used by dynamic and forced rules
*
* publish details of each algorithm and if its forced/fixed/locked in
* as you add methods/algorithms you must update this and the query/map routines
*
* this routine is called by the component only
* this makes sure that the mca parameters are set to their initial values and
* perms module does not call this they call the forced_getvalues routine
* instead
*/
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER] = coll_tuned_reduce_scatter_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_count",
"Number of reduce_scatter algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_reduce_scatter_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_reduce_scatter_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_reduce_scatter_algorithms", reduce_scatter_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm",
"Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_reduce_scatter_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_segmentsize",
"Segment size in bytes used by default for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_segment_size);
coll_tuned_reduce_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_tree_fanout",
"Fanout for n-tree used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_tree_fanout);
coll_tuned_reduce_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_chain_fanout",
"Fanout for chains used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d",
data->user_forced[REDUCESCATTER].algorithm));
switch (data->user_forced[REDUCESCATTER].algorithm) {
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
dtype, op, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}
int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
dtype, op, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}

ompi/mca/coll/base/coll_base_scatter.c — new file, 256 additions
View file

@ -0,0 +1,256 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"
int
ompi_coll_base_scatter_intra_binomial(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int line = -1, i, rank, vrank, size, total_send = 0, err;
char *ptmp, *tempbuf = NULL;
ompi_coll_tree_t* bmtree;
MPI_Status status;
MPI_Aint sextent, slb, strue_lb, strue_extent;
MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
mca_coll_base_comm_t *data = base_module->base_data;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"ompi_coll_base_scatter_intra_binomial rank %d", rank));
/* create the binomial tree */
COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root );
bmtree = data->cached_in_order_bmtree;
ompi_datatype_get_extent(sdtype, &slb, &sextent);
ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent);
ompi_datatype_get_extent(rdtype, &rlb, &rextent);
ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);
vrank = (rank - root + size) % size;
ptmp = (char *) rbuf;  /* by default assume a leaf node and just use rbuf */
if (rank == root) {
if (0 == root) {
/* root on 0, just use the send buffer */
ptmp = (char *) sbuf;
if (rbuf != MPI_IN_PLACE) {
/* local copy to rbuf */
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
rbuf, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
} else {
/* root is not on 0, allocate temp buffer for send */
tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
if (NULL == tempbuf) {
err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
}
ptmp = tempbuf - strue_lb;
/* and rotate data so they will eventually be in the right place */
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root),
ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root,
ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
if (rbuf != MPI_IN_PLACE) {
/* local copy to rbuf */
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
rbuf, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
total_send = scount;
} else if (!(vrank % 2)) {
/* non-root, non-leaf nodes: allocate temp buffer for recv;
* the most we need is rcount*size/2 */
tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
if (NULL == tempbuf) {
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
}
ptmp = tempbuf - rtrue_lb;
sdtype = rdtype;
scount = rcount;
sextent = rextent;
total_send = scount;
}
if (!(vrank % 2)) {
if (rank != root) {
/* recv from parent on non-root */
err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev,
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* local copy to rbuf */
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
rbuf, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
/* send to children on all non-leaf */
for (i = 0; i < bmtree->tree_nextsize; i++) {
size_t mycount = 0;
int vkid;
/* figure out how much data I have to send to this child */
vkid = (bmtree->tree_next[i] - root + size) % size;
mycount = vkid - vrank;
if( (int)mycount > (size - vkid) )
mycount = size - vkid;
mycount *= scount;
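            /* Worked example (illustrative): size = 8, root = 0, so at the
               root vrank = 0 and the in-order children are vkid = 1, 2, 4;
               they receive min(1,7) = 1, min(2,6) = 2 and min(4,4) = 4
               blocks respectively, which together with the root's own
               block covers all 8 blocks. */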
err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype,
bmtree->tree_next[i],
MCA_COLL_BASE_TAG_SCATTER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
total_send += mycount;
}
if (NULL != tempbuf)
free(tempbuf);
} else {
/* recv from parent on leaf nodes */
err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev,
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
return MPI_SUCCESS;
err_hndl:
if (NULL != tempbuf)
free(tempbuf);
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as base/tree based segmenting operations
* and as such may be selected by the decision functions
* These are copied into this module due to the way we select modules
* in V1; in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_base implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
/*
* scatter_intra
*
* Function: - basic scatter operation
* Accepts: - same arguments as MPI_Scatter()
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_base_scatter_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, err;
ptrdiff_t lb, incr;
char *ptmp;
/* Initialize */
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
/* If not root, receive data. */
if (rank != root) {
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
MCA_COLL_BASE_TAG_SCATTER,
comm, MPI_STATUS_IGNORE));
return err;
}
/* I am the root, loop sending data. */
err = ompi_datatype_get_extent(sdtype, &lb, &incr);
if (OMPI_SUCCESS != err) {
return OMPI_ERROR;
}
incr *= scount;
for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
/* simple optimization */
if (i == rank) {
if (MPI_IN_PLACE != rbuf) {
err =
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
rdtype);
}
} else {
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
MCA_COLL_BASE_TAG_SCATTER,
MCA_PML_BASE_SEND_STANDARD, comm));
}
if (MPI_SUCCESS != err) {
return err;
}
}
/* All done */
return MPI_SUCCESS;
}
/* copied function (with appropriate renaming) ends here */

View file

@ -2,19 +2,19 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
@ -25,8 +25,8 @@
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
/*
* Some static helpers.
@ -75,36 +75,36 @@ static int calculate_num_nodes_up_to_level( int fanout, int level )
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_tree( int fanout,
ompi_coll_base_topo_build_tree( int fanout,
struct ompi_communicator_t* comm,
int root )
{
int rank, size, schild, sparent, shiftedrank, i;
int level; /* location of my rank in the tree structure of size */
int delta; /* number of nodes on my level */
int slimit; /* total number of nodes on levels above me */
int slimit; /* total number of nodes on levels above me */
ompi_coll_tree_t* tree;
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree Building fo %d rt %d", fanout, root));
if (fanout<1) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree invalid fanout %d", fanout));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree invalid fanout %d", fanout));
return NULL;
}
if (fanout>MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
return NULL;
}
/*
* Get size and rank of the process in this communicator
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!tree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree PANIC::out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree PANIC::out of memory"));
return NULL;
}
@ -115,8 +115,8 @@ ompi_coll_tuned_topo_build_tree( int fanout,
* Set root
*/
tree->tree_root = root;
/*
/*
* Initialize tree
*/
tree->tree_fanout = fanout;
@ -132,11 +132,11 @@ ompi_coll_tuned_topo_build_tree( int fanout,
if( size < 2 ) {
return tree;
}
/*
* Shift all ranks by root, so that the algorithm can be
* Shift all ranks by root, so that the algorithm can be
* designed as if root were always 0
* shiftedrank should be used in calculating distances
* shiftedrank should be used in calculating distances
* and position in tree
*/
shiftedrank = rank - root;
@ -158,7 +158,7 @@ ompi_coll_tuned_topo_build_tree( int fanout,
break;
}
}
/* find my parent */
slimit = calculate_num_nodes_up_to_level( fanout, level );
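    /* calculate_num_nodes_up_to_level() is presumably the geometric sum
       1 + fanout + ... + fanout^(level-1) = (fanout^level - 1)/(fanout - 1),
       e.g. fanout 3, level 2 -> 1 + 3 = 4 nodes on the levels above mine. */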
sparent = shiftedrank;
@ -170,12 +170,12 @@ ompi_coll_tuned_topo_build_tree( int fanout,
}
}
tree->tree_prev = (sparent+root)%size;
return tree;
}
/*
* Constructs in-order binary tree which can be used for non-commutative reduce
* Constructs in-order binary tree which can be used for non-commutative reduce
* operations.
* Root of this tree is always rank (size-1) and fanout is 2.
* Here are some of the examples of this tree:
@ -189,28 +189,28 @@ ompi_coll_tuned_topo_build_tree( int fanout,
* (ASCII tree diagrams for several communicator sizes elided in the diff rendering)
*/
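/*
 * Illustrative note: the property that matters for non-commutative
 * reductions is that an in-order traversal of this tree visits ranks
 * 0, 1, ..., size-1 in order, so partial results can always be combined
 * left-to-right in rank order.
 */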
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
{
int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
ompi_coll_tree_t* tree;
/*
* Get size and rank of the process in this communicator
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!tree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:topo_build_tree PANIC::out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:topo_build_tree PANIC::out of memory"));
return NULL;
}
tree->tree_root = MPI_UNDEFINED;
tree->tree_nextsize = MPI_UNDEFINED;
/*
/*
* Initialize tree
*/
tree->tree_fanout = 2;
@ -220,11 +220,11 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
tree->tree_nextsize = 0;
tree->tree_next[0] = -1;
tree->tree_next[1] = -1;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:topo_build_in_order_tree Building fo %d rt %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:topo_build_in_order_tree Building fo %d rt %d",
tree->tree_fanout, tree->tree_root));
/*
/*
* Build the tree
*/
myrank = rank;
@ -240,18 +240,18 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
rchild = -1;
if (size - 1 > 0) {
lchild = parent - 1;
if (lchild > 0) {
if (lchild > 0) {
rchild = rightsize - 1;
}
}
/* The following cases are possible: myrank can be
/* The following cases are possible: myrank can be
- a parent,
- belong to the left subtree, or
       - belong to the right subtree
       Each of these cases needs to be handled differently.
*/
if (myrank == parent) {
/* I am the parent:
- compute real ranks of my children, and exit the loop. */
@ -262,7 +262,7 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
if (myrank > rchild) {
/* I belong to the left subtree:
- If I am the left child, compute real rank of my parent
- Iterate down through tree:
- Iterate down through tree:
compute new size, shift ranks down, and update delta.
*/
if (myrank == lchild) {
@ -276,8 +276,8 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
} else {
/* I belong to the right subtree:
- If I am the right child, compute real rank of my parent
- Iterate down through tree:
compute new size and parent,
- Iterate down through tree:
compute new size and parent,
but the delta and rank do not need to change.
*/
if (myrank == rchild) {
@ -287,14 +287,14 @@ ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
parent = rchild;
}
}
if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
return tree;
}
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree )
{
ompi_coll_tree_t *ptr;
@ -311,7 +311,7 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
}
/*
*
*
* Here are some of the examples of this tree:
* (ASCII binomial-tree diagrams for sizes 2, 4 and 8 elided in the diff rendering)
@ -323,16 +323,16 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
int root )
{
int childs = 0, rank, size, mask = 1, index, remote, i;
ompi_coll_tree_t *bmtree;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree rt %d", root));
/*
* Get size and rank of the process in this communicator
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
@ -341,7 +341,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!bmtree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
return NULL;
}
@ -372,7 +372,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
remote += root;
if( remote >= size ) remote -= size;
if (childs==MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
free(bmtree);
return NULL;
}
@ -388,7 +388,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
/*
* Constructs in-order binomial tree which can be used for gather/scatter
* operations.
*
*
* Here are some of the examples of this tree:
* (ASCII binomial-tree diagrams for sizes 2, 4 and 8 elided in the diff rendering)
@ -400,16 +400,16 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
int root )
{
int childs = 0, rank, vrank, size, mask = 1, remote, i;
ompi_coll_tree_t *bmtree;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_in_order_bmtree rt %d", root));
/*
* Get size and rank of the process in this communicator
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
@ -418,7 +418,7 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
if (!bmtree) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
return NULL;
}
@ -442,10 +442,10 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
bmtree->tree_next[childs] = (remote + root) % size;
childs++;
if (childs==MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
"coll:base:topo:build_bmtree max fanout incorrect %d needed %d",
MAXTREEFANOUT, childs));
free (bmtree);
free(bmtree);
return NULL;
}
}
@ -459,36 +459,36 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_chain( int fanout,
ompi_coll_base_topo_build_chain( int fanout,
struct ompi_communicator_t* comm,
int root )
{
int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
ompi_coll_tree_t *chain;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain fo %d rt %d", fanout, root));
/*
* Get size and rank of the process in this communicator
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
if( fanout < 1 ) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
fanout = 1;
}
if (fanout>MAXTREEFANOUT) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
fanout = MAXTREEFANOUT;
}
/*
* Allocate space for topology arrays if needed
* Allocate space for topology arrays if needed
*/
chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) );
if (!chain) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain PANIC out of memory"));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain PANIC out of memory"));
fflush(stdout);
return NULL;
}
@ -496,17 +496,17 @@ ompi_coll_tuned_topo_build_chain( int fanout,
chain->tree_nextsize = -1;
for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
/*
/*
* Set root & numchain
*/
chain->tree_root = root;
if( (size - 1) < fanout ) {
if( (size - 1) < fanout ) {
chain->tree_nextsize = size-1;
fanout = size-1;
} else {
chain->tree_nextsize = fanout;
}
/*
* Shift ranks
*/
@ -577,7 +577,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
chain->tree_nextsize = 1;
} else {
chain->tree_next[0] = -1;
chain->tree_nextsize = 0;
chain->tree_nextsize = 0;
}
}
chain->tree_prev = (chain->tree_prev+root)%size;
@ -586,7 +586,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
}
} else {
/*
* Unshift values
* Unshift values
*/
chain->tree_prev = -1;
chain->tree_next[0] = (root+1)%size;
@ -603,17 +603,18 @@ ompi_coll_tuned_topo_build_chain( int fanout,
return chain;
}
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
{
int i;
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo:topo_dump_tree %1d tree root %d"
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo:topo_dump_tree %1d tree root %d"
" fanout %d BM %1d nextsize %d prev %d",
rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
tree->tree_nextsize, tree->tree_prev));
if( tree->tree_nextsize ) {
for( i = 0; i < tree->tree_nextsize; i++ )
OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d", i, tree->tree_next[i]));
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"[%1d] %d", i, tree->tree_next[i]));
}
return (0);
}

View file

@ -2,22 +2,22 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
#define MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
#ifndef MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED
#define MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED
#include "ompi_config.h"
@ -35,29 +35,28 @@ typedef struct ompi_coll_tree_t {
} ompi_coll_tree_t;
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_tree( int fanout,
ompi_coll_base_topo_build_tree( int fanout,
struct ompi_communicator_t* com,
int root );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm );
ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
int root );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
int root );
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_chain( int fanout,
ompi_coll_base_topo_build_chain( int fanout,
struct ompi_communicator_t* com,
int root );
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree );
int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree );
/* debugging stuff, will be removed later */
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
END_C_DECLS
#endif /* MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED */
#endif /* MCA_COLL_BASE_TOPO_H_HAS_BEEN_INCLUDED */

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -19,17 +19,17 @@
*/
#include "ompi_config.h"
#include "coll_tuned.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned_util.h"
#include "coll_base_util.h"
int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
ompi_datatype_t* sdatatype,
int dest, int stag,
void* recvbuf, size_t rcount,
@ -91,14 +91,14 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
*status = statuses[err_index];
}
err = statuses[err_index].MPI_ERROR;
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_tuned_sendrecv_zero\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
" stage of ompi_coll_base_sendrecv_zero\n",
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
} else {
/* Error discovered during the posting of the irecv or isend,
* and no status is available.
*/
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
__FILE__, line, err));
if (MPI_STATUS_IGNORE != status) {
status->MPI_ERROR = err;

View file

@ -2,24 +2,24 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
*
* Additional copyrights may follow
*
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_UTIL_EXPORT_H
#define MCA_COLL_TUNED_UTIL_EXPORT_H
#ifndef MCA_COLL_BASE_UTIL_EXPORT_H
#define MCA_COLL_BASE_UTIL_EXPORT_H
#include "ompi_config.h"
@ -36,10 +36,10 @@ BEGIN_C_DECLS
* If one of the communications results in a zero-byte message the
* communication is ignored, and no message will cross to the peer.
*/
int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
ompi_datatype_t* sdatatype,
int dest, int stag,
void* recvbuf, size_t rcount,
void* recvbuf, size_t rcount,
ompi_datatype_t* rdatatype,
int source, int rtag,
struct ompi_communicator_t* comm,
@ -53,24 +53,22 @@ int ompi_coll_tuned_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
* communications.
*/
static inline int
ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype,
ompi_coll_base_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdatatype,
int dest, int stag,
void* recvbuf, size_t rcount, ompi_datatype_t* rdatatype,
int source, int rtag,
int source, int rtag,
struct ompi_communicator_t* comm,
ompi_status_public_t* status, int myid )
{
if ((dest == source) && (source == myid)) {
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
recvbuf, (int32_t) rcount, rdatatype);
}
return ompi_coll_tuned_sendrecv_nonzero_actual (sendbuf, scount, sdatatype,
dest, stag,
return ompi_coll_base_sendrecv_nonzero_actual (sendbuf, scount, sdatatype,
dest, stag,
recvbuf, rcount, rdatatype,
source, rtag, comm, status);
}
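/*
 * Typical use in a ring exchange (illustrative sketch; left/right are
 * hypothetical neighbor ranks):
 *
 *   ompi_coll_base_sendrecv(tmpsend, scount, sdtype, right, tag,
 *                           tmprecv, rcount, rdtype, left,  tag,
 *                           comm, MPI_STATUS_IGNORE, rank);
 *
 * When dest == source == myid the call degenerates to a local datatype
 * copy and never touches the PML.
 */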
END_C_DECLS
#endif /* MCA_COLL_TUNED_UTIL_EXPORT_H */
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -31,6 +31,7 @@
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
BEGIN_C_DECLS
@ -52,12 +53,6 @@ BEGIN_C_DECLS
int mca_coll_basic_module_enable(mca_coll_base_module_t *module,
struct ompi_communicator_t *comm);
int mca_coll_basic_allgather_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_allgather_inter(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
@ -65,13 +60,6 @@ BEGIN_C_DECLS
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_allgatherv_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *disps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_allgatherv_inter(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
@ -91,12 +79,6 @@ BEGIN_C_DECLS
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_alltoall_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_alltoall_inter(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
@ -104,14 +86,6 @@ BEGIN_C_DECLS
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_alltoallv_intra(void *sbuf, int *scounts,
int *sdisps,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_alltoallv_inter(void *sbuf, int *scounts,
int *sdisps,
struct ompi_datatype_t *sdtype,
@ -138,21 +112,12 @@ BEGIN_C_DECLS
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_barrier_intra_lin(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_barrier_inter_lin(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_barrier_intra_log(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_bcast_lin_intra(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_bcast_lin_inter(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
@ -183,13 +148,6 @@ BEGIN_C_DECLS
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_gather_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_gather_inter(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
@ -214,12 +172,6 @@ BEGIN_C_DECLS
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_reduce_lin_inter(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
@ -279,13 +231,6 @@ BEGIN_C_DECLS
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_scatter_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module);
int mca_coll_basic_scatter_inter(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -32,50 +32,6 @@
#include "coll_basic.h"
/*
* allgather_intra
*
* Function: - allgather using other MPI collectives
* Accepts: - same as MPI_Allgather()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_basic_allgather_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, void *rbuf,
int rcount, struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int err;
ptrdiff_t lb, extent;
/* Handle MPI_IN_PLACE (see explanation in reduce.c for how to
allocate temp buffer) -- note that rank 0 can use IN_PLACE
natively, and we can just alias the right position in rbuf
as sbuf and avoid using a temporary buffer if gather is
implemented correctly */
if (MPI_IN_PLACE == sbuf && 0 != ompi_comm_rank(comm)) {
ompi_datatype_get_extent(rdtype, &lb, &extent);
sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount);
sdtype = rdtype;
scount = rcount;
}
/* Gather and broadcast. */
err = comm->c_coll.coll_gather(sbuf, scount, sdtype, rbuf, rcount,
rdtype, 0, comm, comm->c_coll.coll_gather_module);
if (MPI_SUCCESS == err) {
err = comm->c_coll.coll_bcast(rbuf, rcount * ompi_comm_size(comm),
rdtype, 0, comm, comm->c_coll.coll_bcast_module);
}
/* All done */
return err;
}
/*
* allgather_inter
*

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -28,87 +28,6 @@
#include "coll_basic.h"
/*
* allgatherv_intra
*
* Function: - allgatherv using other MPI collectives
* Accepts: - same as MPI_Allgatherv()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_basic_allgatherv_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts, int *disps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, size, rank ;
int err;
MPI_Aint extent;
MPI_Aint lb;
char *send_buf = NULL;
struct ompi_datatype_t *newtype, *send_type;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/*
* We don't have a root process defined. Arbitrarily assign root
* to process with rank 0 (OMPI convention)
*/
if (MPI_IN_PLACE == sbuf) {
ompi_datatype_get_extent(rdtype, &lb, &extent);
send_type = rdtype;
send_buf = (char*)rbuf;
for (i = 0; i < rank; ++i) {
send_buf += (rcounts[i] * extent);
}
} else {
send_buf = (char*)sbuf;
send_type = sdtype;
}
err = comm->c_coll.coll_gatherv(send_buf,
rcounts[rank], send_type,rbuf,
rcounts, disps, rdtype, 0,
comm, comm->c_coll.coll_gatherv_module);
if (MPI_SUCCESS != err) {
return err;
}
/*
* we now have all the data in the root's rbuf. Need to
* broadcast the data out to the other processes
*
* Need to define a datatype that captures the different vectors
* from each process. MPI_TYPE_INDEXED with params
* size,rcount,displs,rdtype,newtype
* should do the trick.
* Use underlying ddt functions to create, and commit the
* new datatype on each process, then broadcast and destroy the
* datatype.
*/
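    /* Worked example (illustrative): with size = 3, rcounts = {2,1,3} and
       disps = {0,2,3}, the indexed type describes rank 0's two elements at
       offset 0, rank 1's element at offset 2 and rank 2's three elements
       at offset 3, so broadcasting a single element of 'newtype' moves
       every peer's block into place. */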
err = ompi_datatype_create_indexed(size,rcounts,disps,rdtype,&newtype);
if (MPI_SUCCESS != err) {
return err;
}
err = ompi_datatype_commit(&newtype);
if(MPI_SUCCESS != err) {
return err;
}
err = comm->c_coll.coll_bcast( rbuf, 1 ,newtype,0,comm,
comm->c_coll.coll_bcast_module);
ompi_datatype_destroy (&newtype);
return err;
}
/*
* allgatherv_inter
*

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -32,224 +32,6 @@
#include "ompi/mca/pml/pml.h"
static int
mca_coll_basic_alltoall_intra_inplace(void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
int i, j, size, rank, err=MPI_SUCCESS;
MPI_Request *preq;
char *tmp_buffer;
size_t max_size;
ptrdiff_t ext;
/* Initialize. */
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* If only one process, we're done. */
if (1 == size) {
return MPI_SUCCESS;
}
/* Find the largest receive amount */
ompi_datatype_type_extent (rdtype, &ext);
max_size = ext * rcount;
/* Allocate a temporary buffer */
tmp_buffer = calloc (max_size, 1);
if (NULL == tmp_buffer) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* in-place alltoall slow algorithm (but works) */
for (i = 0 ; i < size ; ++i) {
for (j = i+1 ; j < size ; ++j) {
/* Initiate all send/recv to/from others. */
preq = basic_module->mccb_reqs;
if (i == rank) {
/* Copy the data into the temporary buffer */
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
(char *) rbuf + j * max_size);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Exchange data with the peer */
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
} else if (j == rank) {
/* Copy the data into the temporary buffer */
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
(char *) rbuf + i * max_size);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Exchange data with the peer */
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
} else {
continue;
}
/* Wait for the requests to complete */
err = ompi_request_wait_all (2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Free the requests. */
mca_coll_basic_free_reqs(basic_module->mccb_reqs, 2);
}
}
error_hndl:
/* Free the temporary buffer */
free (tmp_buffer);
/* All done */
return err;
}
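For orientation, the double loop enumerates each unordered pair (i, j) with i < j exactly once; with a hypothetical size of 4, rank 1 is active in rounds (0,1), (1,2) and (1,3), and in each round it stages its outgoing block in tmp_buffer before posting the irecv/isend pair, so the in-place exchange never overwrites data it still has to send.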
/*
* alltoall_intra
*
* Function: - MPI_Alltoall
* Accepts: - same as MPI_Alltoall()
* Returns: - MPI_SUCCESS or an MPI error code
*/
int
mca_coll_basic_alltoall_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i;
int rank;
int size;
int err;
int nreqs;
char *psnd;
char *prcv;
MPI_Aint lb;
MPI_Aint sndinc;
MPI_Aint rcvinc;
ompi_request_t **req;
ompi_request_t **sreq;
ompi_request_t **rreq;
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
/* Initialize. */
if (MPI_IN_PLACE == sbuf) {
return mca_coll_basic_alltoall_intra_inplace (rbuf, rcount, rdtype,
comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
err = ompi_datatype_get_extent(sdtype, &lb, &sndinc);
if (OMPI_SUCCESS != err) {
return err;
}
sndinc *= scount;
err = ompi_datatype_get_extent(rdtype, &lb, &rcvinc);
if (OMPI_SUCCESS != err) {
return err;
}
rcvinc *= rcount;
/* simple optimization */
psnd = ((char *) sbuf) + (rank * sndinc);
prcv = ((char *) rbuf) + (rank * rcvinc);
err = ompi_datatype_sndrcv(psnd, scount, sdtype, prcv, rcount, rdtype);
if (MPI_SUCCESS != err) {
return err;
}
/* If only one process, we're done. */
if (1 == size) {
return MPI_SUCCESS;
}
/* Initiate all send/recv to/from others. */
req = rreq = basic_module->mccb_reqs;
sreq = rreq + size - 1;
prcv = (char *) rbuf;
psnd = (char *) sbuf;
/* Post all receives first -- a simple optimization */
for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++rreq, ++nreqs) {
err =
MCA_PML_CALL(irecv_init
(prcv + (i * rcvinc), rcount, rdtype, i,
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
if (MPI_SUCCESS != err) {
mca_coll_basic_free_reqs(req, nreqs);
return err;
}
}
/* Now post all sends */
for (nreqs = 0, i = (rank + 1) % size; i != rank; i = (i + 1) % size, ++sreq, ++nreqs) {
err =
MCA_PML_CALL(isend_init
(psnd + (i * sndinc), scount, sdtype, i,
MCA_COLL_BASE_TAG_ALLTOALL,
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
if (MPI_SUCCESS != err) {
mca_coll_basic_free_reqs(req, nreqs);
return err;
}
}
nreqs = (size - 1) * 2;
/* Start your engines. This will never return an error. */
MCA_PML_CALL(start(nreqs, req));
/* Wait for them all. If there's an error, note that we don't
* care what the error was -- just that there *was* an error. The
* PML will finish all requests, even if one or more of them fail.
* i.e., by the end of this call, all the requests are free-able.
* So free them anyway -- even if there was an error, and return
* the error after we free everything. */
err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);
/* Free the reqs */
mca_coll_basic_free_reqs(req, nreqs);
/* All done */
return err;
}
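The init/start/wait/free life cycle used above also works stand-alone; a minimal sketch with the public persistent-request API (`peer`, `tag`, `comm` and the int payload are assumptions):

    MPI_Request reqs[2];
    int outval = 42, inval;
    MPI_Recv_init(&inval, 1, MPI_INT, peer, tag, comm, &reqs[0]);
    MPI_Send_init(&outval, 1, MPI_INT, peer, tag, comm, &reqs[1]);
    MPI_Startall(2, reqs);                       /* start both transfers */
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);   /* complete them */
    MPI_Request_free(&reqs[0]);                  /* persistent requests need */
    MPI_Request_free(&reqs[1]);                  /* an explicit free */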
/*
* alltoall_inter
*

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -33,226 +33,6 @@
#include "ompi/mca/pml/pml.h"
static int
mca_coll_basic_alltoallv_intra_inplace(void *rbuf, const int *rcounts, const int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
int i, j, size, rank, err=MPI_SUCCESS;
MPI_Request *preq;
char *tmp_buffer;
size_t max_size;
ptrdiff_t ext;
/* Initialize. */
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* If only one process, we're done. */
if (1 == size) {
return MPI_SUCCESS;
}
/* Find the largest receive amount */
ompi_datatype_type_extent (rdtype, &ext);
for (i = 0, max_size = 0 ; i < size ; ++i) {
size_t size = ext * rcounts[i];
max_size = size > max_size ? size : max_size;
}
/* Allocate a temporary buffer */
tmp_buffer = calloc (max_size, 1);
if (NULL == tmp_buffer) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* in-place alltoallv slow algorithm (but works) */
for (i = 0 ; i < size ; ++i) {
for (j = i+1 ; j < size ; ++j) {
/* Initiate all send/recv to/from others. */
preq = basic_module->mccb_reqs;
if (i == rank && rcounts[j]) {
/* Copy the data into the temporary buffer */
err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j],
tmp_buffer, (char *) rbuf + rdisps[j] * ext);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Exchange data with the peer */
err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype,
j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype,
j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
} else if (j == rank && rcounts[i]) {
/* Copy the data into the temporary buffer */
err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i],
tmp_buffer, (char *) rbuf + rdisps[i] * ext);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Exchange data with the peer */
err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype,
i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype,
i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
comm, preq++));
if (MPI_SUCCESS != err) { goto error_hndl; }
} else {
continue;
}
/* Wait for the requests to complete */
err = ompi_request_wait_all (2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
if (MPI_SUCCESS != err) { goto error_hndl; }
/* Free the requests. */
mca_coll_basic_free_reqs(basic_module->mccb_reqs, 2);
}
}
error_hndl:
/* Free the temporary buffer */
free (tmp_buffer);
/* All done */
return err;
}
/*
* alltoallv_intra
*
* Function: - MPI_Alltoallv
* Accepts: - same as MPI_Alltoallv()
* Returns: - MPI_SUCCESS or an MPI error code
*/
int
mca_coll_basic_alltoallv_intra(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i;
int size;
int rank;
int err;
char *psnd;
char *prcv;
int nreqs;
MPI_Aint sndextent;
MPI_Aint rcvextent;
MPI_Request *preq;
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
/* Initialize. */
if (MPI_IN_PLACE == sbuf) {
return mca_coll_basic_alltoallv_intra_inplace (rbuf, rcounts, rdisps,
rdtype, comm, module);
}
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
ompi_datatype_type_extent(sdtype, &sndextent);
ompi_datatype_type_extent(rdtype, &rcvextent);
/* simple optimization */
psnd = ((char *) sbuf) + (sdisps[rank] * sndextent);
prcv = ((char *) rbuf) + (rdisps[rank] * rcvextent);
if (0 != scounts[rank]) {
err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype,
prcv, rcounts[rank], rdtype);
if (MPI_SUCCESS != err) {
return err;
}
}
/* If only one process, we're done. */
if (1 == size) {
return MPI_SUCCESS;
}
/* Initiate all send/recv to/from others. */
nreqs = 0;
preq = basic_module->mccb_reqs;
/* Post all receives first -- a simple optimization */
for (i = 0; i < size; ++i) {
if (i == rank || 0 == rcounts[i]) {
continue;
}
prcv = ((char *) rbuf) + (rdisps[i] * rcvextent);
err = MCA_PML_CALL(irecv_init(prcv, rcounts[i], rdtype,
i, MCA_COLL_BASE_TAG_ALLTOALLV, comm,
preq++));
++nreqs;
if (MPI_SUCCESS != err) {
mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs);
return err;
}
}
/* Now post all sends */
for (i = 0; i < size; ++i) {
if (i == rank || 0 == scounts[i]) {
continue;
}
psnd = ((char *) sbuf) + (sdisps[i] * sndextent);
err = MCA_PML_CALL(isend_init(psnd, scounts[i], sdtype,
i, MCA_COLL_BASE_TAG_ALLTOALLV,
MCA_PML_BASE_SEND_STANDARD, comm,
preq++));
++nreqs;
if (MPI_SUCCESS != err) {
mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs);
return err;
}
}
/* Start your engines. This will never return an error. */
MCA_PML_CALL(start(nreqs, basic_module->mccb_reqs));
/* Wait for them all. If there's an error, note that we don't care
* what the error was -- just that there *was* an error. The PML
* will finish all requests, even if one or more of them fail.
* i.e., by the end of this call, all the requests are free-able.
* So free them anyway -- even if there was an error, and return the
* error after we free everything. */
err = ompi_request_wait_all(nreqs, basic_module->mccb_reqs,
MPI_STATUSES_IGNORE);
/* Free the requests. */
mca_coll_basic_free_reqs(basic_module->mccb_reqs, nreqs);
/* All done */
return err;
}
/*
* alltoallv_inter
*

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,72 +30,6 @@
#include "coll_basic.h"
/*
* barrier_intra_lin
*
* Function: - barrier using O(N) algorithm
* Accepts: - same as MPI_Barrier()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_basic_barrier_intra_lin(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i;
int err;
int size = ompi_comm_size(comm);
int rank = ompi_comm_rank(comm);
/* All non-root send & receive zero-length message. */
if (rank > 0) {
err =
MCA_PML_CALL(send
(NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err) {
return err;
}
err =
MCA_PML_CALL(recv
(NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
comm, MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err) {
return err;
}
}
/* The root collects and broadcasts the messages. */
else {
for (i = 1; i < size; ++i) {
err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
MCA_COLL_BASE_TAG_BARRIER,
comm, MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err) {
return err;
}
}
for (i = 1; i < size; ++i) {
err =
MCA_PML_CALL(send
(NULL, 0, MPI_BYTE, i,
MCA_COLL_BASE_TAG_BARRIER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err) {
return err;
}
}
}
/* All done */
return MPI_SUCCESS;
}
/*
* barrier_intra_log
*

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -30,78 +30,6 @@
#include "opal/util/bit_ops.h"
/*
* bcast_lin_intra
*
* Function: - broadcast using O(N) algorithm
* Accepts: - same arguments as MPI_Bcast()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_basic_bcast_lin_intra(void *buff, int count,
struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i;
int size;
int rank;
int err;
ompi_request_t **preq;
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
ompi_request_t **reqs = basic_module->mccb_reqs;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* Non-root receive the data. */
if (rank != root) {
return MCA_PML_CALL(recv(buff, count, datatype, root,
MCA_COLL_BASE_TAG_BCAST, comm,
MPI_STATUS_IGNORE));
}
/* Root sends data to all others. */
for (i = 0, preq = reqs; i < size; ++i) {
if (i == rank) {
continue;
}
err = MCA_PML_CALL(isend_init(buff, count, datatype, i,
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD,
comm, preq++));
if (MPI_SUCCESS != err) {
return err;
}
}
--i;
/* Start your engines. This will never return an error. */
MCA_PML_CALL(start(i, reqs));
/* Wait for them all. If there's an error, note that we don't
* care what the error was -- just that there *was* an error. The
* PML will finish all requests, even if one or more of them fail.
* i.e., by the end of this call, all the requests are free-able.
* So free them anyway -- even if there was an error, and return
* the error after we free everything. */
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
/* Free the reqs */
mca_coll_basic_free_reqs(reqs, i);
/* All done */
return err;
}
/*
* bcast_log_intra
*

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -27,68 +27,6 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
/*
* gather_intra
*
* Function: - basic gather operation
* Accepts: - same arguments as MPI_Gather()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_basic_gather_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i;
int err;
int rank;
int size;
char *ptmp;
MPI_Aint incr;
MPI_Aint extent;
MPI_Aint lb;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* Everyone but root sends data and returns. */
if (rank != root) {
return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
MCA_COLL_BASE_TAG_GATHER,
MCA_PML_BASE_SEND_STANDARD, comm));
}
/* I am the root, loop receiving the data. */
ompi_datatype_get_extent(rdtype, &lb, &extent);
incr = extent * rcount;
for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
if (i == rank) {
if (MPI_IN_PLACE != sbuf) {
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
ptmp, rcount, rdtype);
} else {
err = MPI_SUCCESS;
}
} else {
err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, i,
MCA_COLL_BASE_TAG_GATHER,
comm, MPI_STATUS_IGNORE));
}
if (MPI_SUCCESS != err) {
return err;
}
}
/* All done */
return MPI_SUCCESS;
}
/*
* gather_inter

View file

@ -3,10 +3,10 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
@ -129,40 +129,40 @@ mca_coll_basic_comm_query(struct ompi_communicator_t *comm,
basic_module->super.coll_scatter = mca_coll_basic_scatter_inter;
basic_module->super.coll_scatterv = mca_coll_basic_scatterv_inter;
} else if (ompi_comm_size(comm) <= mca_coll_basic_crossover) {
basic_module->super.coll_allgather = mca_coll_basic_allgather_intra;
basic_module->super.coll_allgatherv = mca_coll_basic_allgatherv_intra;
basic_module->super.coll_allgather = ompi_coll_base_allgather_intra_basic_linear;
basic_module->super.coll_allgatherv = ompi_coll_base_allgatherv_intra_basic_default;
basic_module->super.coll_allreduce = mca_coll_basic_allreduce_intra;
basic_module->super.coll_alltoall = mca_coll_basic_alltoall_intra;
basic_module->super.coll_alltoallv = mca_coll_basic_alltoallv_intra;
basic_module->super.coll_alltoall = ompi_coll_base_alltoall_intra_basic_linear;
basic_module->super.coll_alltoallv = ompi_coll_base_alltoallv_intra_basic_linear;
basic_module->super.coll_alltoallw = mca_coll_basic_alltoallw_intra;
basic_module->super.coll_barrier = mca_coll_basic_barrier_intra_lin;
basic_module->super.coll_bcast = mca_coll_basic_bcast_lin_intra;
basic_module->super.coll_barrier = ompi_coll_base_barrier_intra_basic_linear;
basic_module->super.coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
basic_module->super.coll_exscan = mca_coll_basic_exscan_intra;
basic_module->super.coll_gather = mca_coll_basic_gather_intra;
basic_module->super.coll_gather = ompi_coll_base_gather_intra_basic_linear;
basic_module->super.coll_gatherv = mca_coll_basic_gatherv_intra;
basic_module->super.coll_reduce = mca_coll_basic_reduce_lin_intra;
basic_module->super.coll_reduce = ompi_coll_base_reduce_intra_basic_linear;
basic_module->super.coll_reduce_scatter_block = mca_coll_basic_reduce_scatter_block_intra;
basic_module->super.coll_reduce_scatter = mca_coll_basic_reduce_scatter_intra;
basic_module->super.coll_scan = mca_coll_basic_scan_intra;
basic_module->super.coll_scatter = mca_coll_basic_scatter_intra;
basic_module->super.coll_scatter = ompi_coll_base_scatter_intra_basic_linear;
basic_module->super.coll_scatterv = mca_coll_basic_scatterv_intra;
} else {
basic_module->super.coll_allgather = mca_coll_basic_allgather_intra;
basic_module->super.coll_allgatherv = mca_coll_basic_allgatherv_intra;
basic_module->super.coll_allgather = ompi_coll_base_allgather_intra_basic_linear;
basic_module->super.coll_allgatherv = ompi_coll_base_allgatherv_intra_basic_default;
basic_module->super.coll_allreduce = mca_coll_basic_allreduce_intra;
basic_module->super.coll_alltoall = mca_coll_basic_alltoall_intra;
basic_module->super.coll_alltoallv = mca_coll_basic_alltoallv_intra;
basic_module->super.coll_alltoall = ompi_coll_base_alltoall_intra_basic_linear;
basic_module->super.coll_alltoallv = ompi_coll_base_alltoallv_intra_basic_linear;
basic_module->super.coll_alltoallw = mca_coll_basic_alltoallw_intra;
basic_module->super.coll_barrier = mca_coll_basic_barrier_intra_log;
basic_module->super.coll_bcast = mca_coll_basic_bcast_log_intra;
basic_module->super.coll_exscan = mca_coll_basic_exscan_intra;
basic_module->super.coll_gather = mca_coll_basic_gather_intra;
basic_module->super.coll_gather = ompi_coll_base_gather_intra_basic_linear;
basic_module->super.coll_gatherv = mca_coll_basic_gatherv_intra;
basic_module->super.coll_reduce = mca_coll_basic_reduce_log_intra;
basic_module->super.coll_reduce_scatter_block = mca_coll_basic_reduce_scatter_block_intra;
basic_module->super.coll_reduce_scatter = mca_coll_basic_reduce_scatter_intra;
basic_module->super.coll_scan = mca_coll_basic_scan_intra;
basic_module->super.coll_scatter = mca_coll_basic_scatter_intra;
basic_module->super.coll_scatter = ompi_coll_base_scatter_intra_basic_linear;
basic_module->super.coll_scatterv = mca_coll_basic_scatterv_intra;
}
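Net effect: communicators at or below the mca_coll_basic_crossover threshold (a small value — 4 in common builds, though treat the exact default as an assumption) keep the O(N) linear barrier/bcast/reduce, while larger ones switch to the logarithmic variants; the base-provided linear functions are shared by both branches.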

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -28,241 +28,6 @@
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
/*
* reduce_lin_intra
*
* Function: - reduction using O(N) algorithm
* Accepts: - same as MPI_Reduce()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, err, size;
ptrdiff_t true_lb, true_extent, lb, extent;
char *free_buffer = NULL;
char *pml_buffer = NULL;
char *inplace_temp = NULL;
char *inbuf;
/* Initialize */
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
/* If not root, send data to the root. */
if (rank != root) {
err = MCA_PML_CALL(send(sbuf, count, dtype, root,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_STANDARD, comm));
return err;
}
/* Root receives and reduces messages. Allocate buffer to receive
* messages. This comment applies to all collectives in this basic
* module where we allocate a temporary buffer. For the next few
* lines of code, it's tremendously complicated how we decided that
* this was the Right Thing to do. Sit back and enjoy. And prepare
* to have your mind warped. :-)
*
* Recall some definitions (I always get these backwards, so I'm
* going to put them here):
*
* extent: the length from the lower bound to the upper bound -- may
* be considerably larger than the buffer required to hold the data
* (or smaller! But it's easiest to think about when it's larger).
*
* true extent: the exact number of bytes required to hold the data
* in the layout pattern in the datatype.
*
* For example, consider the following buffer (just talking about
* true_lb, extent, and true extent -- extrapolate for true_ub):
*
* A B C
* --------------------------------------------------------
* | | |
* --------------------------------------------------------
*
* There are multiple cases:
*
* 1. A is what we give to MPI_Send (and friends), and A is where
* the data starts, and C is where the data ends. In this case:
*
* - extent: C-A
* - true extent: C-A
* - true_lb: 0
*
* A C
* --------------------------------------------------------
* | |
* --------------------------------------------------------
* <=======================extent=========================>
* <======================true extent=====================>
*
* 2. A is what we give to MPI_Send (and friends), B is where the
* data starts, and C is where the data ends. In this case:
*
* - extent: C-A
* - true extent: C-B
* - true_lb: positive
*
* A B C
* --------------------------------------------------------
* | | User buffer |
* --------------------------------------------------------
* <=======================extent=========================>
* <===============true extent=============>
*
* 3. B is what we give to MPI_Send (and friends), A is where the
* data starts, and C is where the data ends. In this case:
*
* - extent: C-A
* - true extent: C-A
* - true_lb: negative
*
* A B C
* --------------------------------------------------------
* | | User buffer |
* --------------------------------------------------------
* <=======================extent=========================>
* <======================true extent=====================>
*
* 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is
* where the data starts, and C is where the data ends. In this
* case:
*
* - extent: C-MPI_BOTTOM
* - true extent: C-B
* - true_lb: [potentially very large] positive
*
* MPI_BOTTOM B C
* --------------------------------------------------------
* | | User buffer |
* --------------------------------------------------------
* <=======================extent=========================>
* <===============true extent=============>
*
* So in all cases, for a temporary buffer, all we need to malloc()
* is a buffer of size true_extent. We therefore need to know two
* pointer values: what value to give to MPI_Send (and friends) and
* what value to give to free(), because they might not be the same.
*
* Clearly, what we give to free() is exactly what was returned from
* malloc(). That part is easy. :-)
*
* What we give to MPI_Send (and friends) is a bit more complicated.
* Let's take the 4 cases from above:
*
* 1. If A is what we give to MPI_Send and A is where the data
* starts, then clearly we give to MPI_Send what we got back from
* malloc().
*
* 2. If B is what we get back from malloc, but we give A to
* MPI_Send, then the buffer range [A,B) represents "dead space"
* -- no data will be put there. So it's safe to give B-true_lb to
* MPI_Send. More specifically, the true_lb is positive, so B-true_lb is
* actually A.
*
* 3. If A is what we get back from malloc, and B is what we give to
* MPI_Send, then the true_lb is negative, so A-true_lb will actually equal
* B.
*
* 4. Although this seems like the weirdest case, it's actually
* quite similar to case #2 -- the pointer we give to MPI_Send is
* smaller than the pointer we got back from malloc().
*
* Hence, in all cases, we give (return_from_malloc - true_lb) to MPI_Send.
*
* This works fine and dandy if we only have (count==1), which we
* rarely do. ;-) So we really need to allocate (true_extent +
* ((count - 1) * extent)) to get enough space for the rest. This may
* be more than is necessary, but it's ok.
*
* Simple, no? :-)
*
*/
ompi_datatype_get_extent(dtype, &lb, &extent);
ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent);
if (MPI_IN_PLACE == sbuf) {
sbuf = rbuf;
inplace_temp = (char*)malloc(true_extent + (count - 1) * extent);
if (NULL == inplace_temp) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
rbuf = inplace_temp - true_lb;
}
if (size > 1) {
free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
if (NULL == free_buffer) {
if (NULL != inplace_temp) {
free(inplace_temp);
}
return OMPI_ERR_OUT_OF_RESOURCE;
}
pml_buffer = free_buffer - true_lb;
}
/* Initialize the receive buffer. */
if (rank == (size - 1)) {
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf);
} else {
err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1,
MCA_COLL_BASE_TAG_REDUCE, comm,
MPI_STATUS_IGNORE));
}
if (MPI_SUCCESS != err) {
if (NULL != free_buffer) {
free(free_buffer);
}
return err;
}
/* Loop receiving and calling reduction function (C or Fortran). */
for (i = size - 2; i >= 0; --i) {
if (rank == i) {
inbuf = (char*)sbuf;
} else {
err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
MCA_COLL_BASE_TAG_REDUCE, comm,
MPI_STATUS_IGNORE));
if (MPI_SUCCESS != err) {
if (NULL != free_buffer) {
free(free_buffer);
}
return err;
}
inbuf = pml_buffer;
}
/* Perform the reduction */
ompi_op_reduce(op, inbuf, rbuf, count, dtype);
}
if (NULL != inplace_temp) {
err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp);
free(inplace_temp);
}
if (NULL != free_buffer) {
free(free_buffer);
}
/* All done */
return MPI_SUCCESS;
}
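Condensed, the allocation rule from the long comment above is only a few lines; a standalone sketch inside an MPI program, using the public equivalents of the ddt calls (dtype and count assumed):

    MPI_Aint lb, extent, true_lb, true_extent;
    MPI_Type_get_extent(dtype, &lb, &extent);
    MPI_Type_get_true_extent(dtype, &true_lb, &true_extent);
    /* room for `count` elements regardless of padding/lb games */
    char *raw = malloc(true_extent + (count - 1) * extent);
    char *mpi_buf = raw - true_lb;   /* pointer handed to MPI_Send/recv */
    /* ... use mpi_buf ... */
    free(raw);                       /* free exactly what malloc returned */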
/*
* reduce_log_intra
@ -339,8 +104,8 @@ mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count,
* operations. */
if (!ompi_op_is_commute(op)) {
return mca_coll_basic_reduce_lin_intra(sbuf, rbuf, count, dtype,
op, root, comm, module);
return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
op, root, comm, module);
}
/* Some variables */

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -28,73 +28,6 @@
#include "coll_basic.h"
/*
* scatter_intra
*
* Function: - scatter operation
* Accepts: - same arguments as MPI_Scatter()
* Returns: - MPI_SUCCESS or error code
*/
int
mca_coll_basic_scatter_intra(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, err;
char *ptmp;
ptrdiff_t lb, incr;
/* Initialize */
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
/* If not root, receive data. */
if (rank != root) {
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
MCA_COLL_BASE_TAG_SCATTER,
comm, MPI_STATUS_IGNORE));
return err;
}
/* I am the root, loop sending data. */
err = ompi_datatype_get_extent(sdtype, &lb, &incr);
if (OMPI_SUCCESS != err) {
return OMPI_ERROR;
}
incr *= scount;
for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
/* simple optimization */
if (i == rank) {
if (MPI_IN_PLACE != rbuf) {
err =
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
rdtype);
}
} else {
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
MCA_COLL_BASE_TAG_SCATTER,
MCA_PML_BASE_SEND_STANDARD, comm));
}
if (MPI_SUCCESS != err) {
return err;
}
}
/* All done */
return MPI_SUCCESS;
}
/*
* scatter_inter
*

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -470,6 +470,9 @@ struct mca_coll_base_module_2_1_0_t {
be used for the given communicator */
mca_coll_base_module_disable_1_1_0_fn_t coll_module_disable;
/** Data storage for all the algorithms defined in the base. Should
not be used by other modules */
struct mca_coll_base_comm_t* base_data;
};
typedef struct mca_coll_base_module_2_1_0_t mca_coll_base_module_2_1_0_t;

View file

@ -2,7 +2,7 @@
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2009 The University of Tennessee and The University
# Copyright (c) 2004-2015 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -19,29 +19,25 @@
sources = \
coll_tuned.h \
coll_tuned_topo.h \
coll_tuned_util.h \
coll_tuned_dynamic_file.h \
coll_tuned_dynamic_rules.h \
coll_tuned_topo.c \
coll_tuned_util.c \
coll_tuned_decision_fixed.c \
coll_tuned_decision_dynamic.c \
coll_tuned_dynamic_file.c \
coll_tuned_dynamic_rules.c \
coll_tuned_allreduce.c \
coll_tuned_alltoall.c \
coll_tuned_alltoallv.c \
coll_tuned_allgather.c \
coll_tuned_allgatherv.c \
coll_tuned_barrier.c \
coll_tuned_bcast.c \
coll_tuned_reduce.c \
coll_tuned_reduce_scatter.c \
coll_tuned_gather.c \
coll_tuned_scatter.c \
coll_tuned_component.c \
coll_tuned_module.c
coll_tuned_module.c \
coll_tuned_allgather_decision.c \
coll_tuned_allgatherv_decision.c \
coll_tuned_allreduce_decision.c \
coll_tuned_alltoall_decision.c \
coll_tuned_gather_decision.c \
coll_tuned_alltoallv_decision.c \
coll_tuned_barrier_decision.c \
coll_tuned_reduce_decision.c \
coll_tuned_bcast_decision.c \
coll_tuned_reduce_scatter_decision.c \
coll_tuned_scatter_decision.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

View file

@ -1,19 +1,8 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -28,61 +17,17 @@
#include "mpi.h"
#include "opal/mca/mca.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"
/* some fixed value index vars to simplify certain operations */
typedef enum COLLTYPE {
ALLGATHER = 0, /* 0 */
ALLGATHERV, /* 1 */
ALLREDUCE, /* 2 */
ALLTOALL, /* 3 */
ALLTOALLV, /* 4 */
ALLTOALLW, /* 5 */
BARRIER, /* 6 */
BCAST, /* 7 */
EXSCAN, /* 8 */
GATHER, /* 9 */
GATHERV, /* 10 */
REDUCE, /* 11 */
REDUCESCATTER, /* 12 */
SCAN, /* 13 */
SCATTER, /* 14 */
SCATTERV, /* 15 */
COLLCOUNT /* 16 end counter keep it as last element */
} COLLTYPE_T;
/* defined arg lists to simplify auto-inclusion of user overriding decision functions */
#define ALLGATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLGATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void * rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLREDUCE_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALL_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLV_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define ALLTOALLW_ARGS void *sbuf, int *scounts, int *sdisps, struct ompi_datatype_t **sdtypes, void *rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t **rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BARRIER_ARGS struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define BCAST_ARGS void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define EXSCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define GATHERV_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int *rcounts, int *disps, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCE_ARGS void *sbuf, void* rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define REDUCESCATTER_ARGS void *sbuf, void *rbuf, int *rcounts, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCAN_ARGS void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTER_ARGS void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
#define SCATTERV_ARGS void *sbuf, int *scounts, int *disps, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module
/* end defined arg lists to simplify auto-inclusion of user overriding decision functions */
BEGIN_C_DECLS
/* these are the same across all modules and are loaded at component query time */
extern int ompi_coll_tuned_stream;
extern int ompi_coll_tuned_priority;
extern int ompi_coll_tuned_preallocate_memory_comm_size_limit;
extern bool ompi_coll_tuned_use_dynamic_rules;
extern char* ompi_coll_tuned_dynamic_rules_filename;
extern int ompi_coll_tuned_init_tree_fanout;
@ -148,12 +93,6 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_do_forced(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allgather_intra_bruck(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_ring(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_neighborexchange(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_basic_linear(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_intra_two_procs(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_inter_dec_fixed(ALLGATHER_ARGS);
int ompi_coll_tuned_allgather_inter_dec_dynamic(ALLGATHER_ARGS);
@ -163,11 +102,6 @@ int ompi_coll_tuned_allgatherv_intra_dec_dynamic(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_do_forced(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_do_this(ALLGATHERV_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allgatherv_intra_bruck(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_ring(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_inter_dec_fixed(ALLGATHERV_ARGS);
int ompi_coll_tuned_allgatherv_inter_dec_dynamic(ALLGATHERV_ARGS);
@ -177,11 +111,6 @@ int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_forced(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_ring(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
int ompi_coll_tuned_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_inter_dec_fixed(ALLREDUCE_ARGS);
int ompi_coll_tuned_allreduce_inter_dec_dynamic(ALLREDUCE_ARGS);
@ -191,11 +120,6 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_forced(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_do_this(ALLTOALL_ARGS, int algorithm, int faninout, int segsize, int max_requests);
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_alltoall_intra_pairwise(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_bruck(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_basic_linear(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_intra_linear_sync(ALLTOALL_ARGS, int max_requests);
int ompi_coll_tuned_alltoall_intra_two_procs(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_inter_dec_fixed(ALLTOALL_ARGS);
int ompi_coll_tuned_alltoall_inter_dec_dynamic(ALLTOALL_ARGS);
@ -205,8 +129,6 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_intra_do_forced(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_intra_do_this(ALLTOALLV_ARGS, int algorithm);
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_alltoallv_intra_pairwise(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_intra_basic_linear(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_inter_dec_fixed(ALLTOALLV_ARGS);
int ompi_coll_tuned_alltoallv_inter_dec_dynamic(ALLTOALLV_ARGS);
@ -224,12 +146,6 @@ int ompi_coll_tuned_barrier_intra_do_this(BARRIER_ARGS, int algorithm, int fanin
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_barrier_inter_dec_fixed(BARRIER_ARGS);
int ompi_coll_tuned_barrier_inter_dec_dynamic(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_doublering(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_recursivedoubling(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_bruck(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_two_procs(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_linear(BARRIER_ARGS);
int ompi_coll_tuned_barrier_intra_tree(BARRIER_ARGS);
/* Bcast */
int ompi_coll_tuned_bcast_intra_generic( BCAST_ARGS, uint32_t count_by_segment, ompi_coll_tree_t* tree );
@ -238,12 +154,6 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_forced(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_bcast_intra_basic_linear(BCAST_ARGS);
int ompi_coll_tuned_bcast_intra_chain(BCAST_ARGS, uint32_t segsize, int32_t chains);
int ompi_coll_tuned_bcast_intra_pipeline(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_intra_binomial(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_intra_bintree(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_intra_split_bintree(BCAST_ARGS, uint32_t segsize);
int ompi_coll_tuned_bcast_inter_dec_fixed(BCAST_ARGS);
int ompi_coll_tuned_bcast_inter_dec_dynamic(BCAST_ARGS);
@ -259,9 +169,6 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_do_forced(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_gather_intra_basic_linear(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_binomial(GATHER_ARGS);
int ompi_coll_tuned_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
int ompi_coll_tuned_gather_inter_dec_fixed(GATHER_ARGS);
int ompi_coll_tuned_gather_inter_dec_dynamic(GATHER_ARGS);
@ -278,12 +185,6 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_forced(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs);
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_reduce_intra_basic_linear(REDUCE_ARGS);
int ompi_coll_tuned_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
int ompi_coll_tuned_reduce_inter_dec_fixed(REDUCE_ARGS);
int ompi_coll_tuned_reduce_inter_dec_dynamic(REDUCE_ARGS);
@ -293,10 +194,6 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_do_forced(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_inter_dec_fixed(REDUCESCATTER_ARGS);
int ompi_coll_tuned_reduce_scatter_inter_dec_dynamic(REDUCESCATTER_ARGS);
@ -312,8 +209,6 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_do_forced(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
int ompi_coll_tuned_scatter_intra_basic_linear(SCATTER_ARGS);
int ompi_coll_tuned_scatter_intra_binomial(SCATTER_ARGS);
int ompi_coll_tuned_scatter_inter_dec_fixed(SCATTER_ARGS);
int ompi_coll_tuned_scatter_inter_dec_dynamic(SCATTER_ARGS);
@ -325,16 +220,6 @@ int ompi_coll_tuned_scatterv_inter_dec_dynamic(SCATTERV_ARGS);
int mca_coll_tuned_ft_event(int state);
/* Utility functions */
static inline void ompi_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
{
int i;
for (i = 0; i < count; ++i)
ompi_request_free(&reqs[i]);
}
struct mca_coll_tuned_component_t {
/** Base coll component */
mca_coll_base_component_2_0_0_t super;
@ -359,200 +244,17 @@ typedef struct mca_coll_tuned_component_t mca_coll_tuned_component_t;
*/
OMPI_MODULE_DECLSPEC extern mca_coll_tuned_component_t mca_coll_tuned_component;
/*
* Data structure for hanging data off the communicator
* i.e. per module instance
*/
struct mca_coll_tuned_comm_t {
/* standard data for requests and PML usage */
/* Precreate space for requests
* Note this does not affect basic,
* but if in wrong context can confuse a debugger
* this is controlled by an MCA param
*/
ompi_request_t **mcct_reqs;
int mcct_num_reqs;
/*
* tuned topo information caching per communicator
*
* for each communicator we cache the topo information so we can
* reuse without regenerating if we change the root, [or fanout]
* then regenerate and recache this information
*/
/* general tree with n fan out */
ompi_coll_tree_t *cached_ntree;
int cached_ntree_root;
int cached_ntree_fanout;
/* binary tree */
ompi_coll_tree_t *cached_bintree;
int cached_bintree_root;
/* binomial tree */
ompi_coll_tree_t *cached_bmtree;
int cached_bmtree_root;
/* binomial tree */
ompi_coll_tree_t *cached_in_order_bmtree;
int cached_in_order_bmtree_root;
/* chained tree (fanout followed by pipelines) */
ompi_coll_tree_t *cached_chain;
int cached_chain_root;
int cached_chain_fanout;
/* pipeline */
ompi_coll_tree_t *cached_pipeline;
int cached_pipeline_root;
/* in-order binary tree (root of the in-order binary tree is rank 0) */
ompi_coll_tree_t *cached_in_order_bintree;
/* moving to the component */
ompi_coll_com_rule_t *com_rules[COLLCOUNT]; /* the communicator rules for each MPI collective for ONLY my comsize */
/* for forced algorithms we store the information on the module */
/* previously we only had one shared copy, ops, it really is per comm/module */
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
};
typedef struct mca_coll_tuned_comm_t mca_coll_tuned_comm_t;
struct mca_coll_tuned_module_t {
mca_coll_base_module_t super;
mca_coll_tuned_comm_t *tuned_data;
mca_coll_base_module_t super;
/* for forced algorithms we store the information on the module */
/* previously we only had one shared copy, ops, it really is per comm/module */
coll_tuned_force_algorithm_params_t user_forced[COLLCOUNT];
/* the communicator rules for each MPI collective for ONLY my comsize */
ompi_coll_com_rule_t *com_rules[COLLCOUNT];
};
typedef struct mca_coll_tuned_module_t mca_coll_tuned_module_t;
OBJ_CLASS_DECLARATION(mca_coll_tuned_module_t);
static inline void mca_coll_tuned_free_reqs(ompi_request_t ** reqs,
int count)
{
int i;
for (i = 0; i < count; ++i)
ompi_request_free(reqs + i);
}
END_C_DECLS
#define COLL_TUNED_UPDATE_BINTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_bintree) \
&& (coll_comm->cached_bintree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bintree ) { /* destroy previous binomial if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bintree) ); \
} \
coll_comm->cached_bintree = ompi_coll_tuned_topo_build_tree(2,(OMPI_COMM),(ROOT)); \
coll_comm->cached_bintree_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_bmtree) \
&& (coll_comm->cached_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_bmtree) ); \
} \
coll_comm->cached_bmtree = ompi_coll_tuned_topo_build_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_IN_ORDER_BMTREE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_in_order_bmtree) \
&& (coll_comm->cached_in_order_bmtree_root == (ROOT)) ) ) { \
if( coll_comm->cached_in_order_bmtree ) { /* destroy previous binomial if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_in_order_bmtree) ); \
} \
coll_comm->cached_in_order_bmtree = ompi_coll_tuned_topo_build_in_order_bmtree( (OMPI_COMM), (ROOT) ); \
coll_comm->cached_in_order_bmtree_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_PIPELINE( OMPI_COMM, TUNED_MODULE, ROOT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_pipeline) \
&& (coll_comm->cached_pipeline_root == (ROOT)) ) ) { \
if (coll_comm->cached_pipeline) { /* destroy previous pipeline if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_pipeline) ); \
} \
coll_comm->cached_pipeline = ompi_coll_tuned_topo_build_chain( 1, (OMPI_COMM), (ROOT) ); \
coll_comm->cached_pipeline_root = (ROOT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_CHAIN( OMPI_COMM, TUNED_MODULE, ROOT, FANOUT ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !( (coll_comm->cached_chain) \
&& (coll_comm->cached_chain_root == (ROOT)) \
&& (coll_comm->cached_chain_fanout == (FANOUT)) ) ) { \
if( coll_comm->cached_chain) { /* destroy previous chain if defined */ \
ompi_coll_tuned_topo_destroy_tree( &(coll_comm->cached_chain) ); \
} \
coll_comm->cached_chain = ompi_coll_tuned_topo_build_chain((FANOUT), (OMPI_COMM), (ROOT)); \
coll_comm->cached_chain_root = (ROOT); \
coll_comm->cached_chain_fanout = (FANOUT); \
} \
} while (0)
#define COLL_TUNED_UPDATE_IN_ORDER_BINTREE( OMPI_COMM, TUNED_MODULE ) \
do { \
mca_coll_tuned_comm_t* coll_comm = (TUNED_MODULE)->tuned_data; \
if( !(coll_comm->cached_in_order_bintree) ) { \
/* In-order binary tree topology is defined by communicator size */ \
/* Thus, there is no need to destroy anything */ \
coll_comm->cached_in_order_bintree = \
ompi_coll_tuned_topo_build_in_order_bintree((OMPI_COMM)); \
} \
} while (0)
/**
* This macro gives a generic way to compute the best count of
* the segment (i.e. the number of complete datatypes that
* can fit in the specified SEGSIZE). Beware, when this macro
* is called, the SEGCOUNT should be initialized to the count as
* expected by the collective call.
*/
#define COLL_TUNED_COMPUTED_SEGCOUNT(SEGSIZE, TYPELNG, SEGCOUNT) \
if( ((SEGSIZE) >= (TYPELNG)) && \
((SEGSIZE) < ((TYPELNG) * (SEGCOUNT))) ) { \
size_t residual; \
(SEGCOUNT) = (int)((SEGSIZE) / (TYPELNG)); \
residual = (SEGSIZE) - (SEGCOUNT) * (TYPELNG); \
if( residual > ((TYPELNG) >> 1) ) \
(SEGCOUNT)++; \
} \
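Worked example with made-up numbers: SEGSIZE = 1000 bytes, TYPELNG = 24 bytes and SEGCOUNT initialized to 100 gives 1000 / 24 = 41 whole elements with a residual of 1000 - 41 * 24 = 16 bytes; since 16 > (24 >> 1) = 12, the macro rounds up and leaves SEGCOUNT = 42.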
/**
* This macro gives a generic way to compute the well distributed block counts
* when the count and number of blocks are fixed.
* Macro returns "early-block" count, "late-block" count, and "split-index"
* which is the block at which we switch from "early-block" count to
* the "late-block" count.
* count = split_index * early_block_count +
* (block_count - split_index) * late_block_count
* We do not perform ANY error checks - make sure that the input values
* make sense (e.g. count > num_blocks).
*/
#define COLL_TUNED_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX, \
EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = COUNT / NUM_BLOCKS; \
SPLIT_INDEX = COUNT % NUM_BLOCKS; \
if (0 != SPLIT_INDEX) { \
EARLY_BLOCK_COUNT = EARLY_BLOCK_COUNT + 1; \
} \
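Worked example: COUNT = 10 over NUM_BLOCKS = 4 sets EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT = 2 and SPLIT_INDEX = 10 % 4 = 2, then bumps EARLY_BLOCK_COUNT to 3; indeed 2 * 3 + (4 - 2) * 2 = 10.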
#endif /* MCA_COLL_TUNED_EXPORT_H */

View file

@ -0,0 +1,218 @@
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#include "coll_tuned.h"
/* allgather algorithm variables */
static int coll_tuned_allgather_forced_algorithm = 0;
static int coll_tuned_allgather_segment_size = 0;
static int coll_tuned_allgather_tree_fanout;
static int coll_tuned_allgather_chain_fanout;
/* valid values for coll_tuned_allgather_forced_algorithm */
static mca_base_var_enum_value_t allgather_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "bruck"},
{3, "recursive_doubling"},
{4, "ring"},
{5, "neighbor"},
{6, "two_proc"},
{0, NULL}
};
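For reference, a forced algorithm is selected at run time through the MCA parameter registered below; assuming a stock build (parameter spellings follow from the registrations in this file), forcing the ring variant would look roughly like:

    mpirun --mca coll_tuned_use_dynamic_rules 1 \
           --mca coll_tuned_allgather_algorithm 4 ./app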
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if its forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values and permissions */
/* modules do not call this; they call the forced_getvalues routine instead */
int
ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != allgather_algorithms[cnt].string; cnt++ );
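    /* allgather_algorithms[] ends with the {0, NULL} sentinel, so cnt now
       holds the number of named algorithms (6 at this point); the same
       counting idiom recurs in every *_check_forced_init below */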
ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_count",
"Number of allgather algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allgather_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allgather_algorithms", allgather_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm",
"Which allallgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allgather_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_segmentsize",
"Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_segment_size);
coll_tuned_allgather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_tree_fanout",
"Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_tree_fanout);
coll_tuned_allgather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgather_algorithm_chain_fanout",
"Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgather_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allgather_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_forced selected algorithm %d",
tuned_module->user_forced[ALLGATHER].algorithm));
switch (tuned_module->user_forced[ALLGATHER].algorithm) {
case (0):
return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (1):
return ompi_coll_base_allgather_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (2):
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (3):
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (4):
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (5):
return ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (6):
return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[ALLGATHER].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_allgather_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (1):
return ompi_coll_base_allgather_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (2):
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (3):
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (4):
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (5):
return ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
case (6):
return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHER]));
return (MPI_ERR_ARG);
}
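/*
 * For context, a minimal sketch of how a user exercises the forced path
 * above. The file name is hypothetical; the MCA parameter names are the
 * ones registered in this file plus the component-level
 * coll_tuned_use_dynamic_rules switch, and the program itself is plain
 * MPI, so the forced algorithm stays invisible to it:
 *
 *   mpicc allgather_demo.c -o allgather_demo
 *   mpirun -np 4 --mca coll_tuned_use_dynamic_rules 1 \
 *                --mca coll_tuned_allgather_algorithm 2 ./allgather_demo
 *   (2 selects "bruck" in the enum above)
 */
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int *gathered = malloc((size_t)size * sizeof(int));
    /* each rank contributes its rank id; coll/tuned picks the concrete
       implementation (forced, rule-based, or fixed) underneath */
    MPI_Allgather(&rank, 1, MPI_INT, gathered, 1, MPI_INT, MPI_COMM_WORLD);

    if (0 == rank) {
        for (int i = 0; i < size; i++)
            printf("gathered[%d] = %d\n", i, gathered[i]);
    }
    free(gathered);
    MPI_Finalize();
    return 0;
}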

View file

@ -0,0 +1,212 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* allgatherv algorithm variables */
static int coll_tuned_allgatherv_forced_algorithm = 0;
static int coll_tuned_allgatherv_segment_size = 0;
static int coll_tuned_allgatherv_tree_fanout;
static int coll_tuned_allgatherv_chain_fanout;
/* valid values for coll_tuned_allgatherv_forced_algorithm */
static mca_base_var_enum_value_t allgatherv_algorithms[] = {
{0, "ignore"},
{1, "default"},
{2, "bruck"},
{3, "ring"},
{4, "neighbor"},
{5, "two_proc"},
{0, NULL}
};
/*
 * The following are used by dynamic and forced rules. Publish details
 * of each algorithm and whether it is forced/fixed/locked in; as you add
 * methods/algorithms you must update this and the query/map routines.
 * This routine is called by the component only; it makes sure the MCA
 * parameters are set to their initial values and permissions. Modules
 * do not call this; they call the forced_getvalues routine instead.
 */
int
ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != allgatherv_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_count",
"Number of allgatherv algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allgatherv_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allgatherv_algorithms", allgatherv_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm",
"Which allallgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allgatherv_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_segmentsize",
"Segment size in bytes used by default for allgatherv algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_segment_size);
coll_tuned_allgatherv_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_tree_fanout",
"Fanout for n-tree used for allgatherv algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_tree_fanout);
coll_tuned_allgatherv_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allgatherv_algorithm_chain_fanout",
"Fanout for chains used for allgatherv algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allgatherv_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
tuned_module->user_forced[ALLGATHERV].algorithm));
switch (tuned_module->user_forced[ALLGATHERV].algorithm) {
case (0):
return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (1):
return ompi_coll_base_allgatherv_intra_basic_default(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (2):
return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (3):
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (4):
return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (5):
return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[ALLGATHERV].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout,
int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_allgatherv_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (1):
return ompi_coll_base_allgatherv_intra_basic_default(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (2):
return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (3):
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (4):
return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
case (5):
return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
return (MPI_ERR_ARG);
}

View file

@ -0,0 +1,182 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* allreduce algorithm variables */
static int coll_tuned_allreduce_algorithm_count = 5;
static int coll_tuned_allreduce_forced_algorithm = 0;
static int coll_tuned_allreduce_segment_size = 0;
static int coll_tuned_allreduce_tree_fanout;
static int coll_tuned_allreduce_chain_fanout;
/* valid values for coll_tuned_allreduce_forced_algorithm */
static mca_base_var_enum_value_t allreduce_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "nonoverlapping"},
{3, "recursive_doubling"},
{4, "ring"},
{5, "segmented_ring"},
{0, NULL}
};
/*
 * The following are used by dynamic and forced rules. Publish details
 * of each algorithm and whether it is forced/fixed/locked in; as you add
 * methods/algorithms you must update this and the query/map routines.
 * This routine is called by the component only; it makes sure the MCA
 * parameters are set to their initial values and permissions. Modules
 * do not call this; they call the forced_getvalues routine instead.
 */
int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != allreduce_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_count",
"Number of allreduce algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_allreduce_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_allreduce_algorithms", allreduce_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm",
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast), 3 recursive doubling, 4 ring, 5 segmented ring",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_allreduce_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_segmentsize",
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_segment_size);
coll_tuned_allreduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_tree_fanout",
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_tree_fanout);
coll_tuned_allreduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"allreduce_algorithm_chain_fanout",
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_allreduce_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d, segment size %d",
tuned_module->user_forced[ALLREDUCE].algorithm,
tuned_module->user_forced[ALLREDUCE].segsize));
switch (tuned_module->user_forced[ALLREDUCE].algorithm) {
case (0):
return ompi_coll_tuned_allreduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, comm, module);
case (1):
return ompi_coll_base_allreduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, comm, module);
case (2):
return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
case (3):
return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
case (4):
return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
case (5):
return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, tuned_module->user_forced[ALLREDUCE].segsize);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[ALLREDUCE].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_allreduce_intra_dec_fixed(sbuf, rbuf, count, dtype, op, comm, module);
case (1):
return ompi_coll_base_allreduce_intra_basic_linear(sbuf, rbuf, count, dtype, op, comm, module);
case (2):
return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
case (3):
return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
case (4):
return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
case (5):
return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
return (MPI_ERR_ARG);
}
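/*
 * Note: of the allreduce variants wired in above, only the segmented
 * ring (algorithm 5) actually consumes the segment size, so a forced
 * run pairs the two parameters, e.g. (values illustrative):
 *   --mca coll_tuned_allreduce_algorithm 5
 *   --mca coll_tuned_allreduce_algorithm_segmentsize 4096
 */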

View file

@ -0,0 +1,204 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* alltoall algorithm variables */
static int coll_tuned_alltoall_forced_algorithm = 0;
static int coll_tuned_alltoall_segment_size = 0;
static int coll_tuned_alltoall_max_requests;
static int coll_tuned_alltoall_tree_fanout;
static int coll_tuned_alltoall_chain_fanout;
/* valid values for coll_tuned_alltoall_forced_algorithm */
static mca_base_var_enum_value_t alltoall_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "pairwise"},
{3, "modified_bruck"},
{4, "linear_sync"},
{5, "two_proc"},
{0, NULL}
};
/*
 * The following are used by dynamic and forced rules. Publish details
 * of each algorithm and whether it is forced/fixed/locked in; as you add
 * methods/algorithms you must update this and the query/map routines.
 * This routine is called by the component only; it makes sure the MCA
 * parameters are set to their initial values and permissions. Modules
 * do not call this; they call the forced_getvalues routine instead.
 */
int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
    mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != alltoall_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_count",
"Number of alltoall algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_alltoall_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_alltoall_algorithms", alltoall_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm",
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: linear with sync, 5:two proc only.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_alltoall_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_segmentsize",
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_segment_size);
coll_tuned_alltoall_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_tree_fanout",
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_tree_fanout);
coll_tuned_alltoall_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_chain_fanout",
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_chain_fanout);
coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
mca_param_indices->max_requests_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_max_requests",
"Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoall_max_requests);
if (mca_param_indices->max_requests_param_index < 0) {
return mca_param_indices->max_requests_param_index;
}
if (coll_tuned_alltoall_max_requests < 0) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to system level default %d \n",
ompi_coll_tuned_init_max_requests );
}
coll_tuned_alltoall_max_requests = 0;
}
return (MPI_SUCCESS);
}
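/*
 * Note on the validation above: max_requests = 0 is the documented
 * "no limit" setting, and a negative value is reset to 0 (unlimited)
 * even though the warning text mentions the system-level default, so
 * the message and the action differ slightly.
 */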
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
tuned_module->user_forced[ALLTOALL].algorithm));
switch (tuned_module->user_forced[ALLTOALL].algorithm) {
case (0):
return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (1):
return ompi_coll_base_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (2):
return ompi_coll_base_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (3):
return ompi_coll_base_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (4):
return ompi_coll_base_alltoall_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module,
tuned_module->user_forced[ALLTOALL].max_requests);
case (5):
return ompi_coll_base_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize,
int max_requests)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_alltoall_intra_dec_fixed(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (1):
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (2):
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (3):
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
case (4):
return ompi_coll_base_alltoall_intra_linear_sync(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, max_requests);
case (5):
return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
return (MPI_ERR_ARG);
}

View file

@ -0,0 +1,156 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* alltoallv algorithm variables */
static int coll_tuned_alltoallv_algorithm_count = 2;
static int coll_tuned_alltoallv_forced_algorithm = 0;
/* valid values for coll_tuned_alltoallv_forced_algorithm */
static mca_base_var_enum_value_t alltoallv_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "pairwise"},
{0, NULL}
};
/*
* The following are used by dynamic and forced rules. Publish
* details of each algorithm and if its forced/fixed/locked in as you add
* methods/algorithms you must update this and the query/map routines.
* This routine is called by the component only. This makes sure that
* the mca parameters are set to their initial values and perms.
* Module does not call this. They call the forced_getvalues routine
* instead.
*/
int ompi_coll_tuned_alltoallv_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t
*mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != alltoallv_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoallv_algorithm_count",
"Number of alltoallv algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_alltoallv_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_alltoallv_algorithms", alltoallv_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoallv_algorithm",
"Which alltoallv algorithm is used. "
"Can be locked down to choice of: 0 ignore, "
"1 basic linear, 2 pairwise.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_alltoallv_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_alltoallv_intra_do_forced(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_forced selected algorithm %d",
tuned_module->user_forced[ALLTOALLV].algorithm));
switch (tuned_module->user_forced[ALLTOALLV].algorithm) {
case (0):
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (1):
return ompi_coll_base_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (2):
return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_forced attempt to "
"select algorithm %d when only 0-%d is valid.",
tuned_module->user_forced[ALLTOALLV].algorithm,
ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
return (MPI_ERR_ARG);
}
/* If the user selects dynamic rules and specifies the algorithm to
* use, then this function is called. */
int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoallv_intra_do_this selected algorithm %d ",
algorithm));
switch (algorithm) {
case (0):
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (1):
return ompi_coll_base_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
case (2):
return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:alltoall_intra_do_this attempt to select "
"algorithm %d when only 0-%d is valid.",
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALLV]));
return (MPI_ERR_ARG);
}

View file

@ -0,0 +1,135 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* barrier algorithm variables */
static int coll_tuned_barrier_forced_algorithm = 0;
/* valid values for coll_tuned_barrier_forced_algorithm */
static mca_base_var_enum_value_t barrier_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "double_ring"},
{3, "recursive_doubling"},
{4, "bruck"},
{5, "two_proc"},
{6, "tree"},
{0, NULL}
};
/*
 * The following are used by dynamic and forced rules. Publish details
 * of each algorithm and whether it is forced/fixed/locked in; as you add
 * methods/algorithms you must update this and the query/map routines.
 * This routine is called by the component only; it makes sure the MCA
 * parameters are set to their initial values and permissions. Modules
 * do not call this; they call the forced_getvalues routine instead.
 */
int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != barrier_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[BARRIER] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm_count",
"Number of barrier algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_barrier_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_barrier_algorithms", barrier_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"barrier_algorithm",
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_barrier_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
return (MPI_SUCCESS);
}
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:barrier_intra_do_forced selected algorithm %d",
tuned_module->user_forced[BARRIER].algorithm));
switch (tuned_module->user_forced[BARRIER].algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed(comm, module);
case (1): return ompi_coll_base_barrier_intra_basic_linear(comm, module);
case (2): return ompi_coll_base_barrier_intra_doublering(comm, module);
case (3): return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
case (4): return ompi_coll_base_barrier_intra_bruck(comm, module);
case (5): return ompi_coll_base_barrier_intra_two_procs(comm, module);
case (6): return ompi_coll_base_barrier_intra_tree(comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[BARRIER].algorithm,
ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d",
algorithm, faninout));
switch (algorithm) {
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed(comm, module);
case (1): return ompi_coll_base_barrier_intra_basic_linear(comm, module);
case (2): return ompi_coll_base_barrier_intra_doublering(comm, module);
case (3): return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
case (4): return ompi_coll_base_barrier_intra_bruck(comm, module);
case (5): return ompi_coll_base_barrier_intra_two_procs(comm, module);
case (6): return ompi_coll_base_barrier_intra_tree(comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
return (MPI_ERR_ARG);
}

View file

@ -0,0 +1,183 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* bcast algorithm variables */
static int coll_tuned_bcast_algorithm_count = 6;
static int coll_tuned_bcast_forced_algorithm = 0;
static int coll_tuned_bcast_segment_size = 0;
static int coll_tuned_bcast_tree_fanout;
static int coll_tuned_bcast_chain_fanout;
/* valid values for coll_tuned_bcast_forced_algorithm */
static mca_base_var_enum_value_t bcast_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "chain"},
{3, "pipeline"},
{4, "split_binary_tree"},
{5, "binary_tree"},
{6, "binomial"},
{0, NULL}
};
/*
 * The following are used by dynamic and forced rules. Publish details
 * of each algorithm and whether it is forced/fixed/locked in; as you add
 * methods/algorithms you must update this and the query/map routines.
 * This routine is called by the component only; it makes sure the MCA
 * parameters are set to their initial values and permissions. Modules
 * do not call this; they call the forced_getvalues routine instead.
 */
int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != bcast_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[BCAST] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_count",
"Number of bcast algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_bcast_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_bcast_algorithms", bcast_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm",
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree, 6: binomial tree.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_bcast_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_segmentsize",
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_segment_size);
coll_tuned_bcast_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_tree_fanout",
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_tree_fanout);
coll_tuned_bcast_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"bcast_algorithm_chain_fanout",
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_bcast_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
tuned_module->user_forced[BCAST].algorithm));
switch (tuned_module->user_forced[BCAST].algorithm) {
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
case (1): return ompi_coll_base_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
case (2): return ompi_coll_base_bcast_intra_chain( buf, count, dtype, root, comm, module,
tuned_module->user_forced[BCAST].segsize,
tuned_module->user_forced[BCAST].chain_fanout );
case (3): return ompi_coll_base_bcast_intra_pipeline( buf, count, dtype, root, comm, module,
tuned_module->user_forced[BCAST].segsize );
case (4): return ompi_coll_base_bcast_intra_split_bintree( buf, count, dtype, root, comm, module,
tuned_module->user_forced[BCAST].segsize );
case (5): return ompi_coll_base_bcast_intra_bintree( buf, count, dtype, root, comm, module,
tuned_module->user_forced[BCAST].segsize );
case (6): return ompi_coll_base_bcast_intra_binomial( buf, count, dtype, root, comm, module,
tuned_module->user_forced[BCAST].segsize );
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
struct ompi_datatype_t *dtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_bcast_intra_dec_fixed( buf, count, dtype, root, comm, module );
case (1):
return ompi_coll_base_bcast_intra_basic_linear( buf, count, dtype, root, comm, module );
case (2):
return ompi_coll_base_bcast_intra_chain( buf, count, dtype, root, comm, module, segsize, faninout );
case (3):
return ompi_coll_base_bcast_intra_pipeline( buf, count, dtype, root, comm, module, segsize );
case (4):
return ompi_coll_base_bcast_intra_split_bintree( buf, count, dtype, root, comm, module, segsize );
case (5):
return ompi_coll_base_bcast_intra_bintree( buf, count, dtype, root, comm, module, segsize );
case (6):
return ompi_coll_base_bcast_intra_binomial( buf, count, dtype, root, comm, module, segsize );
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
return (MPI_ERR_ARG);
}
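/*
 * Note: in do_this the generic faninout argument is forwarded as the
 * chain fanout for algorithm 2 only; the pipeline and tree variants
 * (3-6) take just the segment size, so the registered fanout knobs are
 * otherwise unused by these implementations.
 */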

View file

@ -2,10 +2,10 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
@ -44,7 +44,6 @@ const char *ompi_coll_tuned_component_version_string =
*/
int ompi_coll_tuned_stream = -1;
int ompi_coll_tuned_priority = 30;
int ompi_coll_tuned_preallocate_memory_comm_size_limit = (32 * 1024);
bool ompi_coll_tuned_use_dynamic_rules = false;
char* ompi_coll_tuned_dynamic_rules_filename = (char*) NULL;
int ompi_coll_tuned_init_tree_fanout = 4;
@ -121,16 +120,6 @@ static int tuned_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_coll_tuned_priority);
/* parameter for pre-allocated memory requests etc */
ompi_coll_tuned_preallocate_memory_comm_size_limit = (32 * 1024);
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"pre_allocate_memory_comm_size_limit",
"Size of communicator were we stop pre-allocating memory for the fixed internal buffer used for message requests etc that is hung off the communicator data segment. I.e. if you have a 100'000 nodes you might not want to pre-allocate 200'000 request handle slots per communicator instance!",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_coll_tuned_preallocate_memory_comm_size_limit);
/* some initial guesses at topology parameters */
ompi_coll_tuned_init_tree_fanout = 4;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
@ -272,56 +261,13 @@ static int tuned_close(void)
static void
mca_coll_tuned_module_construct(mca_coll_tuned_module_t *module)
{
module->tuned_data = NULL;
}
static void
mca_coll_tuned_module_destruct(mca_coll_tuned_module_t *module)
{
mca_coll_tuned_comm_t *data;
/* Free the space in the data mpool and the data hanging off the
communicator */
data = module->tuned_data;
if (NULL != data) {
#if OPAL_ENABLE_DEBUG
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
           the general c_coll_selected_data */
data->mcct_reqs = NULL;
data->mcct_num_reqs = 0;
#endif
/* free any cached information that has been allocated */
if (data->cached_ntree) { /* destroy general tree if defined */
ompi_coll_tuned_topo_destroy_tree (&data->cached_ntree);
}
if (data->cached_bintree) { /* destroy bintree if defined */
ompi_coll_tuned_topo_destroy_tree (&data->cached_bintree);
}
if (data->cached_bmtree) { /* destroy bmtree if defined */
ompi_coll_tuned_topo_destroy_tree (&data->cached_bmtree);
}
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bmtree);
}
if (data->cached_chain) { /* destroy general chain if defined */
ompi_coll_tuned_topo_destroy_tree (&data->cached_chain);
}
if (data->cached_pipeline) { /* destroy pipeline if defined */
ompi_coll_tuned_topo_destroy_tree (&data->cached_pipeline);
}
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bintree);
}
free(data);
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
for( int i = 0; i < COLLCOUNT; i++ ) {
tuned_module->user_forced[i].algorithm = 0;
tuned_module->com_rules[i] = NULL;
}
}
OBJ_CLASS_INSTANCE(mca_coll_tuned_module_t,
mca_coll_base_module_t,
mca_coll_tuned_module_construct,
mca_coll_tuned_module_destruct);
OBJ_CLASS_INSTANCE(mca_coll_tuned_module_t, mca_coll_base_module_t,
mca_coll_tuned_module_construct, NULL);

View file

@ -2,18 +2,18 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@ -28,13 +28,10 @@
#include "ompi/mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "coll_tuned.h"
/*
* Notes on evaluation rules and ordering
*
* The order is:
* use file based rules if presented (-coll_tuned_dynamic_rules_filename = rules)
* Else
* use forced rules (-coll_tuned_dynamic_ALG_intra_algorithm = algorithm-number)
@ -58,12 +55,11 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_dynamic"));
/* check to see if we have some filebased rules */
if (data->com_rules[ALLREDUCE]) {
if (tuned_module->com_rules[ALLREDUCE]) {
        /* we do, so calc the message size or whatever we need and use this for the evaluation */
int alg, faninout, segsize, ignoreme;
size_t dsize;
@ -71,7 +67,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
ompi_datatype_type_size (dtype, &dsize);
dsize *= count;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLREDUCE],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLREDUCE],
dsize, &faninout, &segsize, &ignoreme);
if (alg) {
@ -82,7 +78,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[ALLREDUCE].algorithm) {
if (tuned_module->user_forced[ALLREDUCE].algorithm) {
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op,
comm, module);
}
@ -91,27 +87,26 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
}
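/*
 * The cascade spelled out in the notes above (file-based rules, then a
 * user-forced algorithm, then the compiled-in fixed decision) is the
 * same in every *_intra_dec_dynamic function in this file. A
 * self-contained sketch with stand-in types and helpers; every name
 * below is illustrative, not OMPI API:
 */
#include <stdio.h>

typedef struct { int algorithm; } rule_t;

static int run_algorithm(int alg)
{
    printf("running algorithm %d\n", alg);
    return 0;
}

static int dec_fixed(void)
{
    printf("falling back to fixed decision\n");
    return 0;
}

static int dec_dynamic(const rule_t *com_rule, int user_forced_alg)
{
    if (NULL != com_rule && com_rule->algorithm) {   /* file-based rules win */
        return run_algorithm(com_rule->algorithm);
    }
    if (user_forced_alg) {                           /* then the forced MCA choice */
        return run_algorithm(user_forced_alg);
    }
    return dec_fixed();                              /* else built-in heuristics */
}

int main(void)
{
    rule_t ring = { 4 };
    dec_dynamic(&ring, 0);   /* file rule selects algorithm 4 */
    dec_dynamic(NULL, 2);    /* forced algorithm 2 */
    dec_dynamic(NULL, 0);    /* fixed-decision fallback */
    return 0;
}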
/*
* alltoall_intra_dec
*
 * Function: - selects alltoall algorithm to use
 * Accepts: - same arguments as MPI_Alltoall()
 * Returns: - MPI_SUCCESS or error code (passed from the alltoall implementation)
*/
int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_dynamic"));
/* check to see if we have some filebased rules */
if (data->com_rules[ALLTOALL]) {
if (tuned_module->com_rules[ALLTOALL]) {
        /* we do, so calc the message size or whatever we need and use this for the evaluation */
int comsize;
int alg, faninout, segsize, max_requests;
@ -121,7 +116,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
comsize = ompi_comm_size(comm);
dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLTOALL],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALL],
dsize, &faninout, &segsize, &max_requests);
if (alg) {
@ -133,7 +128,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[ALLTOALL].algorithm) {
if (tuned_module->user_forced[ALLTOALL].algorithm) {
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
@ -152,12 +147,11 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoallv_intra_dec_dynamic"));
@ -167,10 +161,10 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
     * This allows the user to specify the alltoallv algorithm to be used only
* based on the communicator size.
*/
if (data->com_rules[ALLTOALLV]) {
if (tuned_module->com_rules[ALLTOALLV]) {
int alg, faninout, segsize, max_requests;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLTOALLV],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALLV],
0, &faninout, &segsize, &max_requests);
if (alg) {
@ -182,7 +176,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[ALLTOALLV].algorithm) {
if (tuned_module->user_forced[ALLTOALLV].algorithm) {
return ompi_coll_tuned_alltoallv_intra_do_forced(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps, rdtype,
comm, module);
@ -193,7 +187,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
}
/*
* barrier_intra_dec
*
* Function: - selects barrier algorithm to use
* Accepts: - same arguments as MPI_Barrier()
@ -203,16 +197,15 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_dec_dynamic"));
/* check to see if we have some filebased rules */
if (data->com_rules[BARRIER]) {
if (tuned_module->com_rules[BARRIER]) {
/* we do, so calc the message size or whatever we need and use this for the evaluation */
int alg, faninout, segsize, ignoreme;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[BARRIER],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BARRIER],
0, &faninout, &segsize, &ignoreme);
if (alg) {
@ -222,14 +215,14 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[BARRIER].algorithm) {
if (tuned_module->user_forced[BARRIER].algorithm) {
return ompi_coll_tuned_barrier_intra_do_forced (comm, module);
}
return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
}
/*
* bcast_intra_dec
*
* Function: - selects broadcast algorithm to use
* Accepts: - same arguments as MPI_Bcast()
@ -241,12 +234,11 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:bcast_intra_dec_dynamic"));
/* check to see if we have some filebased rules */
if (data->com_rules[BCAST]) {
if (tuned_module->com_rules[BCAST]) {
/* we do, so calc the message size or whatever we need and use this for the evaluation */
int alg, faninout, segsize, ignoreme;
size_t dsize;
@ -254,7 +246,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
ompi_datatype_type_size (datatype, &dsize);
dsize *= count;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[BCAST],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BCAST],
dsize, &faninout, &segsize, &ignoreme);
if (alg) {
@ -266,7 +258,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
} /*end if any com rules to check */
if (data->user_forced[BCAST].algorithm) {
if (tuned_module->user_forced[BCAST].algorithm) {
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root,
comm, module);
}
@ -275,12 +267,12 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
}
/*
* reduce_intra_dec
*
* Function: - selects reduce algorithm to use
* Accepts: - same arguments as MPI_Reduce()
* Returns: - MPI_SUCCESS or error code (passed from the reduce implementation)
*
*/
int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
int count, struct ompi_datatype_t* datatype,
@ -289,12 +281,11 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_dec_dynamic"));
/* check to see if we have some filebased rules */
if (data->com_rules[REDUCE]) {
if (tuned_module->com_rules[REDUCE]) {
/* we do, so calc the message size or whatever we need and use this for the evaluation */
int alg, faninout, segsize, max_requests;
@ -303,21 +294,21 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
ompi_datatype_type_size (datatype, &dsize);
dsize *= count;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[REDUCE],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCE],
dsize, &faninout, &segsize, &max_requests);
if (alg) {
/* we have found a valid choice from the file based rules for this message size */
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
op, root,
comm, module,
alg, faninout,
segsize,
max_requests);
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[REDUCE].algorithm) {
if (tuned_module->user_forced[REDUCE].algorithm) {
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype,
op, root,
comm, module);
@ -328,15 +319,15 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
}
/*
* reduce_scatter_intra_dec
*
* Function: - selects reduce_scatter algorithm to use
* Accepts: - same arguments as MPI_Reduce_scatter()
* Returns: - MPI_SUCCESS or error code (passed from
* the reduce_scatter implementation)
*
*/
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
@ -344,13 +335,12 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_intra_dec_dynamic"));
/* check to see if we have some filebased rules */
if (data->com_rules[REDUCESCATTER]) {
if (tuned_module->com_rules[REDUCESCATTER]) {
/* we do, so calc the message size or whatever we need and use
this for the evaluation */
int alg, faninout, segsize, ignoreme, i, count, size;
size_t dsize;
@ -359,21 +349,21 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
ompi_datatype_type_size (dtype, &dsize);
dsize *= count;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[REDUCESCATTER],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCESCATTER],
dsize, &faninout,
&segsize, &ignoreme);
if (alg) {
/* we have found a valid choice from the file based rules for this message size */
return ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts,
dtype, op,
comm, module,
alg, faninout,
segsize);
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[REDUCESCATTER].algorithm) {
if (tuned_module->user_forced[REDUCESCATTER].algorithm) {
return ompi_coll_tuned_reduce_scatter_intra_do_forced (sbuf, rbuf, rcounts,
dtype, op,
comm, module);
}
@ -383,7 +373,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
}
/*
* allgather_intra_dec
*
* Function: - selects allgather algorithm to use
* Accepts: - same arguments as MPI_Allgather()
@ -391,58 +381,57 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
* allgather function).
*/
int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgather_intra_dec_dynamic"));
if (data->com_rules[ALLGATHER]) {
if (tuned_module->com_rules[ALLGATHER]) {
/* We have file based rules:
- calculate message size and other necessary information */
int comsize;
int alg, faninout, segsize, ignoreme;
size_t dsize;
ompi_datatype_type_size (sdtype, &dsize);
comsize = ompi_comm_size(comm);
dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHER],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHER],
dsize, &faninout, &segsize, &ignoreme);
if (alg) {
/* we have found a valid choice from the file based rules for
this message size */
return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module,
alg, faninout, segsize);
}
}
}
/* We do not have file based rules */
if (data->user_forced[ALLGATHER].algorithm) {
if (tuned_module->user_forced[ALLGATHER].algorithm) {
/* User-forced algorithm */
return ompi_coll_tuned_allgather_intra_do_forced (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
/* Use default decision */
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
/*
* allgatherv_intra_dec
*
* Function: - selects allgatherv algorithm to use
* Accepts: - same arguments as MPI_Allgatherv()
@ -450,71 +439,69 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
* allgatherv function).
*/
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgatherv_intra_dec_dynamic"));
if (data->com_rules[ALLGATHERV]) {
if (tuned_module->com_rules[ALLGATHERV]) {
/* We have file based rules:
- calculate message size and other necessary information */
int comsize, i;
int alg, faninout, segsize, ignoreme;
size_t dsize, total_size;
comsize = ompi_comm_size(comm);
ompi_datatype_type_size (sdtype, &dsize);
total_size = 0;
for (i = 0; i < comsize; i++) { total_size += dsize * rcounts[i]; }
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHERV],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHERV],
total_size, &faninout, &segsize, &ignoreme);
if (alg) {
/* we have found a valid choice from the file based rules for
this message size */
return ompi_coll_tuned_allgatherv_intra_do_this (sbuf, scount, sdtype,
rbuf, rcounts,
rdispls, rdtype,
comm, module,
alg, faninout, segsize);
}
}
}
/* We do not have file based rules */
if (data->user_forced[ALLGATHERV].algorithm) {
if (tuned_module->user_forced[ALLGATHERV].algorithm) {
/* User-forced algorithm */
return ompi_coll_tuned_allgatherv_intra_do_forced (sbuf, scount, sdtype,
rbuf, rcounts,
rdispls, rdtype,
comm, module);
}
/* Use default decision */
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcounts,
rdispls, rdtype,
comm, module);
}
int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_dec_dynamic"));
@ -522,15 +509,15 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
/**
* check to see if we have some filebased rules.
*/
if (data->com_rules[GATHER]) {
if (tuned_module->com_rules[GATHER]) {
int comsize, alg, faninout, segsize, max_requests;
size_t dsize;
comsize = ompi_comm_size(comm);
ompi_datatype_type_size (sdtype, &dsize);
dsize *= comsize;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[GATHER],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[GATHER],
dsize, &faninout, &segsize, &max_requests);
if (alg) {
@ -542,26 +529,25 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[GATHER].algorithm) {
if (tuned_module->user_forced[GATHER].algorithm) {
return ompi_coll_tuned_gather_intra_do_forced (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}
int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_scatter_intra_dec_dynamic"));
@ -569,15 +555,15 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
/**
* check to see if we have some filebased rules.
*/
if (data->com_rules[SCATTER]) {
if (tuned_module->com_rules[SCATTER]) {
int comsize, alg, faninout, segsize, max_requests;
size_t dsize;
comsize = ompi_comm_size(comm);
ompi_datatype_type_size (sdtype, &dsize);
dsize *= comsize;
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[SCATTER],
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[SCATTER],
dsize, &faninout, &segsize, &max_requests);
if (alg) {
@ -589,13 +575,13 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
} /* found a method */
} /*end if any com rules to check */
if (data->user_forced[SCATTER].algorithm) {
if (tuned_module->user_forced[SCATTER].algorithm) {
return ompi_coll_tuned_scatter_intra_do_forced (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}

View file

@ -3,10 +3,10 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
@ -14,9 +14,9 @@
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@ -31,7 +31,6 @@
#include "ompi/op/op.h"
#include "coll_tuned.h"
/*
* allreduce_intra
*
@ -40,11 +39,11 @@
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_allreduce_intra_dec_fixed(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
size_t dsize, block_dsize;
int comm_size = ompi_comm_size(comm);
@ -53,8 +52,8 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
/**
* Decision function based on MX results from the Grig cluster at UTK.
*
* Currently, linear, recursive doubling, and nonoverlapping algorithms
* can handle both commutative and non-commutative operations.
* Ring algorithm does not support non-commutative operations.
*/
@ -62,40 +61,40 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
block_dsize = dsize * (ptrdiff_t)count;
if (block_dsize < intermediate_message) {
return (ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf,
count, dtype,
op, comm, module));
}
return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
count, dtype,
op, comm, module));
}
if( ompi_op_is_commute(op) && (count > comm_size) ) {
const size_t segment_size = 1 << 20; /* 1 MB */
if (((size_t)comm_size * (size_t)segment_size >= block_dsize)) {
return (ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype,
op, comm, module));
return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype,
op, comm, module));
} else {
return (ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
count, dtype,
op, comm, module,
segment_size));
return (ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf,
count, dtype,
op, comm, module,
segment_size));
}
}
return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count,
dtype, op, comm, module));
return (ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count,
dtype, op, comm, module));
}
/*
* alltoall_intra_dec
*
* Function: - selects alltoall algorithm to use
* Accepts: - same arguments as MPI_Alltoall()
* Returns: - MPI_SUCCESS or error code (passed from the alltoall implementation)
*/
int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
@ -109,12 +108,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
/* special case */
if (communicator_size==2) {
return ompi_coll_tuned_alltoall_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
/* Decision function based on measurement on Grig cluster at
the University of Tennessee (2GB MX) up to 64 nodes.
Has better performance for messages of intermediate sizes than the old one */
/* determine block size */
@ -123,19 +122,19 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
if ((block_dsize < (size_t) ompi_coll_tuned_alltoall_small_msg)
&& (communicator_size > 12)) {
return ompi_coll_tuned_alltoall_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} else if (block_dsize < (size_t) ompi_coll_tuned_alltoall_intermediate_msg) {
return ompi_coll_tuned_alltoall_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
#if 0
/* previous decision */
@ -148,12 +147,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
ompi_comm_rank(comm), communicator_size, total_dsize));
if (communicator_size >= 12 && total_dsize <= 768) {
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
}
if (total_dsize <= 131072) {
return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
}
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
#endif
}
@ -170,14 +169,14 @@ int ompi_coll_tuned_alltoallv_intra_dec_fixed(void *sbuf, int *scounts, int *sdi
mca_coll_base_module_t *module)
{
/* For starters, just keep the original algorithm. */
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps,rdtype,
comm, module);
return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
rbuf, rcounts, rdisps,rdtype,
comm, module);
}
/*
* barrier_intra_dec
*
* Function: - selects barrier algorithm to use
* Accepts: - same arguments as MPI_Barrier()
@ -192,7 +191,7 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
communicator_size));
if( 2 == communicator_size )
return ompi_coll_tuned_barrier_intra_two_procs(comm, module);
return ompi_coll_base_barrier_intra_two_procs(comm, module);
/**
* Basic optimisation. If we have a power of 2 number of nodes
* then use the recursive doubling algorithm, otherwise
@ -203,19 +202,17 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
for( ; communicator_size > 0; communicator_size >>= 1 ) {
if( communicator_size & 0x1 ) {
if( has_one )
return ompi_coll_tuned_barrier_intra_bruck(comm, module);
return ompi_coll_base_barrier_intra_bruck(comm, module);
has_one = true;
}
}
}
return ompi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
/* return ompi_coll_tuned_barrier_intra_linear(comm); */
/* return ompi_coll_tuned_barrier_intra_doublering(comm); */
return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
}
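/*
 * The bit loop above is a popcount-style test: it scans communicator_size
 * and falls back to the bruck barrier as soon as a second set bit shows
 * up, i.e. whenever the size is not a power of two. An equivalent compact
 * check (a sketch, not the committed code):
 */
#include <stdbool.h>

/* true when n has exactly one set bit, i.e. n is a power of two */
static bool is_power_of_two(unsigned int n)
{
    return (n != 0) && (0 == (n & (n - 1)));
}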
/*
* bcast_intra_dec
*
* Function: - selects broadcast algorithm to use
* Accepts: - same arguments as MPI_Bcast()
@ -226,14 +223,14 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
/* Decision function based on MX results for
messages up to 36MB and communicator sizes up to 64 nodes */
const size_t small_message_size = 2048;
const size_t intermediate_message_size = 370728;
const double a_p16 = 3.2118e-6; /* [1 / byte] */
const double b_p16 = 8.7936;
const double a_p64 = 2.3679e-6; /* [1 / byte] */
const double b_p64 = 1.1787;
const double a_p128 = 1.6134e-6; /* [1 / byte] */
const double b_p128 = 2.1102;
@ -251,95 +248,95 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
" root %d rank %d com_size %d msg_length %lu",
root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
/* Handle messages of small and intermediate size, and
single-element broadcasts */
if ((message_size < small_message_size) || (count <= 1)) {
/* Binomial without segmentation */
segsize = 0;
return ompi_coll_tuned_bcast_intra_binomial (buff, count, datatype,
root, comm, module,
segsize);
return ompi_coll_base_bcast_intra_binomial(buff, count, datatype,
root, comm, module,
segsize);
} else if (message_size < intermediate_message_size) {
/* SplittedBinary with 1KB segments */
segsize = 1024;
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
root, comm, module,
segsize);
return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
root, comm, module,
segsize);
}
/* Handle large message sizes */
else if (communicator_size < (a_p128 * message_size + b_p128)) {
/* Pipeline with 128KB segments */
segsize = 1024 << 7;
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
root, comm, module,
segsize);
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
root, comm, module,
segsize);
} else if (communicator_size < 13) {
/* Split Binary with 8KB segments */
segsize = 1024 << 3;
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
root, comm, module,
segsize);
return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
root, comm, module,
segsize);
} else if (communicator_size < (a_p64 * message_size + b_p64)) {
/* Pipeline with 64KB segments */
segsize = 1024 << 6;
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
root, comm, module,
segsize);
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
root, comm, module,
segsize);
} else if (communicator_size < (a_p16 * message_size + b_p16)) {
/* Pipeline with 16KB segments */
segsize = 1024 << 4;
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
root, comm, module,
segsize);
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
root, comm, module,
segsize);
}
/* Pipeline with 8KB segments */
segsize = 1024 << 3;
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
root, comm, module,
segsize);
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
root, comm, module,
segsize);
#if 0
/* this is based on gige measurements */
if (communicator_size < 4) {
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
}
if (communicator_size == 4) {
if (message_size < 524288) segsize = 0;
else segsize = 16384;
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
}
if (communicator_size <= 8 && message_size < 4096) {
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
}
if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
segsize = 16384;
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
}
if (message_size >= 524288) {
segsize = 16384;
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype, root, comm, module, segsize);
}
segsize = 0;
/* once tested can swap this back in */
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
/* return ompi_coll_base_bcast_intra_bmtree(buff, count, datatype, root, comm, segsize); */
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
#endif /* 0 */
}
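/*
 * To make the linear models above concrete: a branch guarded by
 * "communicator_size < a * message_size + b" flips at a communicator size
 * that grows with the message length. A standalone sketch (not component
 * code) that evaluates the crossovers for a 1 MiB broadcast with the
 * constants from this function:
 */
#include <stdio.h>

int main(void)
{
    const double a_p16  = 3.2118e-6, b_p16  = 8.7936;
    const double a_p64  = 2.3679e-6, b_p64  = 1.1787;
    const double a_p128 = 1.6134e-6, b_p128 = 2.1102;
    const double msg = 1024.0 * 1024.0;   /* 1 MiB message */

    /* a branch is taken while communicator_size < a * message_size + b */
    printf("128KB pipeline while comm_size < %.1f\n", a_p128 * msg + b_p128);
    printf(" 64KB pipeline while comm_size < %.1f\n", a_p64  * msg + b_p64);
    printf(" 16KB pipeline while comm_size < %.1f\n", a_p16  * msg + b_p16);
    /* prints roughly 3.8, 3.7 and 12.2: for 1 MiB broadcasts anything but
       a very small communicator ends up in the final 8KB pipeline branch */
    return 0;
}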
/*
* reduce_intra_dec
*
* Function: - selects reduce algorithm to use
* Accepts: - same arguments as MPI_Reduce()
* Returns: - MPI_SUCCESS or error code (passed from the reduce implementation)
*
*/
int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
int count, struct ompi_datatype_t* datatype,
@ -367,15 +364,15 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
message_size = dsize * (ptrdiff_t)count; /* needed for decision */
/**
* If the operation is non-commutative we currently have a choice of linear
* or in-order binary tree algorithm.
*/
if( !ompi_op_is_commute(op) ) {
if ((communicator_size < 12) && (message_size < 2048)) {
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
}
return ompi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
0, max_requests);
return ompi_coll_base_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
}
return ompi_coll_base_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
0, max_requests);
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed "
@ -384,27 +381,27 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
if ((communicator_size < 8) && (message_size < 512)){
/* Linear_0K */
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
} else if (((communicator_size < 8) && (message_size < 20480)) ||
(message_size < 2048) || (count <= 1)) {
/* Binomial_0K */
segsize = 0;
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
} else if (communicator_size > (a1 * message_size + b1)) {
/* Binomial_1K */
segsize = 1024;
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
} else if (communicator_size > (a2 * message_size + b2)) {
/* Pipeline_1K */
segsize = 1024;
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
} else if (communicator_size > (a3 * message_size + b3)) {
/* Binary_32K */
segsize = 32*1024;
return ompi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
return ompi_coll_base_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
comm, module, segsize, max_requests);
}
if (communicator_size > (a4 * message_size + b4)) {
@ -414,8 +411,8 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
/* Pipeline_64K */
segsize = 64*1024;
}
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
#if 0
/* for small messages use linear algorithm */
@ -424,8 +421,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
fanout = communicator_size - 1;
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
}
if (message_size < 524288) {
if (message_size <= 65536 ) {
@ -437,21 +433,21 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
}
/* later swap this for a binary tree */
/* fanout = 2; */
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, fanout, max_requests);
return ompi_coll_base_reduce_intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, fanout, max_requests);
}
segsize = 1024;
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
segsize, max_requests);
#endif /* 0 */
}
/*
* reduce_scatter_intra_dec
*
* Function: - selects reduce_scatter algorithm to use
* Accepts: - same arguments as MPI_Reduce_scatter()
* Returns: - MPI_SUCCESS or error code (passed from
* the reduce scatter implementation)
*/
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
@ -474,16 +470,16 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
/* We need data size for decision function */
ompi_datatype_type_size(dtype, &dsize);
total_message_size = 0;
for (i = 0; i < comm_size; i++) {
total_message_size += rcounts[i];
}
if( !ompi_op_is_commute(op) ) {
return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
dtype, op,
comm, module);
return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
dtype, op,
comm, module);
}
total_message_size *= dsize;
/* compute the nearest power of 2 */
@ -492,18 +488,18 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
if ((total_message_size <= small_message_size) ||
((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
(comm_size >= a * total_message_size + b)) {
return
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op,
comm, module);
}
return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
return
ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op,
comm, module);
}
return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
dtype, op,
comm, module);
}
/*
* allgather_intra_dec
*
* Function: - selects allgather algorithm to use
* Accepts: - same arguments as MPI_Allgather()
@ -511,10 +507,10 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
* internal allgather function.
*/
int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
@ -525,78 +521,78 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
/* Special case for 2 processes */
if (communicator_size == 2) {
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
/* Determine complete data size */
ompi_datatype_type_size(sdtype, &dsize);
total_dsize = dsize * (ptrdiff_t)scount * (ptrdiff_t)communicator_size;
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
" rank %d com_size %d msg_length %lu",
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
pow2_size = opal_next_poweroftwo_inclusive (communicator_size);
/* Decision based on MX 2Gb results from Grig cluster at
The University of Tennessee, Knoxville
- if total message size is less than 50KB use either bruck or
recursive doubling for non-power of two and power of two nodes,
respectively.
- else use ring and neighbor exchange algorithms for odd and even
number of nodes, respectively.
*/
if (total_dsize < 50000) {
if (pow2_size == communicator_size) {
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} else {
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
} else {
if (communicator_size % 2) {
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} else {
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
}
#if defined(USE_MPICH2_DECISION)
/* Decision as in MPICH-2
presented in Thakur et al. "Optimization of Collective Communication
Operations in MPICH", International Journal of High Performance Computing
Applications, Vol. 19, No. 1, 49-66 (2005)
- for power-of-two processes and small and medium size messages
(up to 512KB) use recursive doubling
- for non-power-of-two processes and small messages (80KB) use bruck,
- for everything else use ring.
*/
if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} else if (total_dsize <= 81920) {
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} else if (total_dsize <= 81920) {
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
}
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
#endif /* defined(USE_MPICH2_DECISION) */
}
/*
* allgatherv_intra_dec
*
* Function: - selects allgatherv algorithm to use
* Accepts: - same arguments as MPI_Allgatherv()
@ -604,59 +600,59 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
* internal allgatherv function.
*/
int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int *rcounts,
int *rdispls,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i;
int communicator_size;
size_t dsize, total_dsize;
communicator_size = ompi_comm_size(comm);
/* Special case for 2 processes */
if (communicator_size == 2) {
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
}
/* Determine complete data size */
ompi_datatype_type_size(sdtype, &dsize);
total_dsize = 0;
for (i = 0; i < communicator_size; i++) {
total_dsize += dsize * (ptrdiff_t)rcounts[i];
}
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_allgatherv_intra_dec_fixed"
" rank %d com_size %d msg_length %lu",
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
/* Decision based on allgather decision. */
if (total_dsize < 50000) {
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
} else {
if (communicator_size % 2) {
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
} else {
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm, module);
}
}
}
/*
* gather_intra_dec
*
* Function: - selects gather algorithm to use
* Accepts: - same arguments as MPI_Gather()
@ -664,10 +660,10 @@ int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
* internal gather function.
*/
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
@ -685,7 +681,7 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
int communicator_size, rank;
size_t dsize, block_size;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_gather_intra_dec_fixed"));
communicator_size = ompi_comm_size(comm);
@ -701,33 +697,32 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
}
if (block_size > large_block_size) {
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
large_segment_size);
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
large_segment_size);
} else if (block_size > intermediate_block_size) {
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
small_segment_size);
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
small_segment_size);
} else if ((communicator_size > large_communicator_size) ||
((communicator_size > small_communicator_size) &&
(block_size < small_block_size))) {
return ompi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}
/* Otherwise, use basic linear */
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}
/*
* scatter_intra_dec
*
* Function: - selects scatter algorithm to use
* Accepts: - same arguments as MPI_Scatter()
@ -735,10 +730,10 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
* internal scatter function.
*/
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
@ -747,7 +742,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
int communicator_size, rank;
size_t dsize, block_size;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_scatter_intra_dec_fixed"));
communicator_size = ompi_comm_size(comm);
@ -759,15 +754,15 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
} else {
ompi_datatype_type_size(rdtype, &dsize);
block_size = dsize * (ptrdiff_t)rcount;
}
if ((communicator_size > small_comm_size) &&
(block_size < small_block_size)) {
return ompi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
}

View file

@ -1,21 +1,20 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@ -28,7 +27,7 @@
#include "coll_tuned.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"
@ -43,7 +42,7 @@ static long getnext (FILE *fptr); /* local function */
static int fileline=0; /* used for verbose error messages */
/*
* Reads a rule file called fname
* Builds the algorithm rule table for a max of n_collectives
*
@ -97,6 +96,10 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
/* make space and init the algorithm rules for each of the n_collectives MPI collectives */
alg_rules = ompi_coll_tuned_mk_alg_rules (n_collectives);
if (NULL == alg_rules) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"cannot allocate rules for file [%s]\n", fname));
goto on_file_error;
}
@ -127,10 +130,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
if (alg_rules[CI].alg_rule_id != CI) {
OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI));
fclose(fptr);
ompi_coll_tuned_free_all_rules (alg_rules, n_collectives);
*rules = (ompi_coll_alg_rule_t*) NULL;
return (-4);
goto on_file_error;
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI));
alg_p = &alg_rules[CI];
@ -151,7 +151,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
for (ncs=0;ncs<NCS;ncs++) { /* for each comm size */
com_p = &(alg_p->com_rules[ncs]);
CS = (int)getnext (fptr);
if (CS<0) {
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
@ -165,7 +165,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
goto on_file_error;
}
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n",
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n",
NMS, CI, CS));
com_p->n_msg_sizes = NMS;
com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS);
@ -222,7 +222,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI));
} /* per collective */
fclose (fptr);
OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n"));
@ -291,4 +291,3 @@ static long getnext (FILE *fptr)
if ('#' == trash) skiptonewline (fptr);
} while (1);
}
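/*
 * Illustrative rules file for the reader above (hypothetical content: the
 * collective ID numbering comes from coll_base_functions.h and the five
 * message-rule fields mirror ompi_coll_tuned_dump_msg_rule() in the next
 * file; both are assumptions, the parser itself is authoritative).
 * '#' starts a comment, as handled by getnext()/skiptonewline() above:
 *
 *   1                 # number of collectives configured in this file
 *   0                 # collective ID (assumed numbering)
 *   1                 # number of comm-size rule sets for it
 *   64                # comm size this rule set applies to
 *   2                 # number of message-size rules
 *   0     1 0 0    0  # from 0 bytes: alg 1, faninout 0, segsize 0, max_req 0
 *   65536 2 0 8192 0  # from 64 KB: alg 2, segsize 8192
 */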

View file

@ -2,18 +2,18 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2011-2012 FUJITSU LIMITED. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@ -25,7 +25,7 @@
#include "coll_tuned.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
/* also need the dynamic rule structures */
#include "coll_tuned_dynamic_rules.h"
@ -33,7 +33,7 @@
#include <stdlib.h>
#include <stdio.h>
#include "coll_tuned_util.h"
#include "ompi/mca/coll/base/coll_base_util.h"
ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
@ -43,7 +43,7 @@ ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
alg_rules = (ompi_coll_alg_rule_t *) calloc (n_alg, sizeof (ompi_coll_alg_rule_t));
if (!alg_rules) return (alg_rules);
/* set all we can at this point */
for (i=0;i<n_alg;i++) {
alg_rules[i].alg_rule_id = i;
@ -52,7 +52,7 @@ ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
}
ompi_coll_com_rule_t* ompi_coll_tuned_mk_com_rules (int n_com_rules, int alg_rule_id)
{
int i;
ompi_coll_com_rule_t * com_rules;
@ -95,9 +95,9 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul
/*
* Debug / IO routines
*
*/
int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
{
if (!msg_p) {
@ -105,11 +105,11 @@ int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
return (-1);
}
OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\tcom_id %3d\tcom_size %3d\tmsg_id %3d\t", msg_p->alg_rule_id,
OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\tcom_id %3d\tcom_size %3d\tmsg_id %3d\t", msg_p->alg_rule_id,
msg_p->com_rule_id, msg_p->mpi_comsize, msg_p->msg_rule_id));
OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n",
msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize,
OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n",
msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize,
msg_p->result_max_requests));
return (0);
@ -268,7 +268,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
return (rc);
}
/*
* query functions
* i.e. the functions that get me the algorithm, topo fanin/out and segment size fast
* and also get the rules that are needed by each communicator as needed
@ -277,7 +277,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
/*
* This function is used to get the pointer to the nearest (less than or equal)
* com rule for this MPI collective (alg_id) for a given
* MPI communicator size. The complete rule base must be presented.
*
* If no rule exists returns NULL, else the com rule ptr
@ -302,7 +302,7 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
}
/* ok have some com sizes, now to find the one closest to my mpi_comsize */
/* make a copy of the first com rule */
best_com_p = com_p = alg_p->com_rules;
i = best = 0;
@ -324,13 +324,13 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
return (best_com_p);
}
/*
* This function takes a com_rule ptr (from the communicators coll tuned data structure)
* (Which is chosen for a particular MPI collective)
* and a (total_)msg_size and it returns (0) and an algorithm to use and a recommended topo faninout and segment size
* all based on the user supplied rules
*
* Just like the above functions it uses a less than or equal msg size
* (hence the config file must have a default defined for '0' if we reach this point)
* else if no rules match we return '0' + '0,0' or use the fixed decision table with no topo change and no segmentation
* of the user's data.
@ -339,7 +339,7 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
*
*/
int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, size_t mpi_msgsize, int *result_topo_faninout,
int* result_segsize, int* max_requests)
{
ompi_coll_msg_rule_t* msg_p = (ompi_coll_msg_rule_t*) NULL;
@ -352,7 +352,7 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
}
/* ok have some msg sizes, now to find the one closest to my mpi_msgsize */
/* make a copy of the first msg rule */
best_msg_p = msg_p = base_com_rule->msg_rules;
i = best = 0;
@ -387,6 +387,5 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
*max_requests = best_msg_p->result_max_requests;
/* return the algorithm/method to use */
return (best_msg_p->result_alg);
}
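/*
 * ompi_coll_tuned_get_target_method_params() above is a less-than-or-equal
 * match over the per-communicator message rules. A self-contained sketch
 * of that search (illustrative struct; the real ompi_coll_msg_rule_t
 * carries more fields, and the list is assumed sorted by ascending size):
 */
#include <stddef.h>

struct msg_rule { size_t msg_size; int alg; };  /* stand-in type */

/* pick the rule with the largest msg_size <= wanted; the 0-byte rule
 * therefore acts as the default the comments above call for */
static int nearest_le_alg(const struct msg_rule *rules, int n, size_t wanted)
{
    int best = 0;
    for (int i = 1; i < n; i++) {
        if (rules[i].msg_size <= wanted) {
            best = i;      /* still at or below the request: keep it */
        } else {
            break;         /* sorted ascending, nothing closer follows */
        }
    }
    return rules[best].alg;
}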

View file

@ -0,0 +1,198 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
/* gather algorithm variables */
static int coll_tuned_gather_forced_algorithm = 0;
static int coll_tuned_gather_segment_size = 0;
static int coll_tuned_gather_tree_fanout;
static int coll_tuned_gather_chain_fanout;
/* valid values for coll_tuned_gather_forced_algorithm */
static mca_base_var_enum_value_t gather_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "binomial"},
{3, "linear_sync"},
{0, NULL}
};
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if it's forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* modules do not call this; they call the forced_getvalues routine instead */
int
ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != gather_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[GATHER] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_count",
"Number of gather algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_gather_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_gather_algorithms", gather_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm",
"Which gather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 linear with synchronization.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_gather_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_segmentsize",
"Segment size in bytes used by default for gather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_segment_size);
coll_tuned_gather_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_tree_fanout",
"Fanout for n-tree used for gather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_tree_fanout);
coll_tuned_gather_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"gather_algorithm_chain_fanout",
"Fanout for chains used for gather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_gather_chain_fanout);
return (MPI_SUCCESS);
}
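As the MPI_T comment above hints, a variable registered this way is visible through the standard MPI tool information interface. A minimal, hedged sketch of locating it by name (assumes an MPI library built with MPI_T support; error handling omitted):

    #include <mpi.h>
    #include <stdio.h>
    #include <string.h>

    int main(int argc, char **argv)
    {
        int provided, ncvars;
        MPI_T_init_thread(MPI_THREAD_SINGLE, &provided);
        MPI_T_cvar_get_num(&ncvars);
        for (int i = 0; i < ncvars; i++) {
            char name[256], desc[256];
            int name_len = sizeof(name), desc_len = sizeof(desc);
            int verbosity, binding, scope;
            MPI_Datatype dtype; MPI_T_enum enumtype;
            MPI_T_cvar_get_info(i, name, &name_len, &verbosity, &dtype,
                                &enumtype, desc, &desc_len, &binding, &scope);
            if (0 == strcmp(name, "coll_tuned_gather_algorithm"))
                printf("cvar '%s' found at index %d\n", name, i);
        }
        MPI_T_finalize();
        return 0;
    }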
int
ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_forced selected algorithm %d",
tuned_module->user_forced[GATHER].algorithm));
switch (tuned_module->user_forced[GATHER].algorithm) {
case (0):
return ompi_coll_tuned_gather_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (3):
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
tuned_module->user_forced[GATHER].segsize);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[GATHER].algorithm,
ompi_coll_tuned_forced_max_algorithms[GATHER]));
return (MPI_ERR_ARG);
}
int
ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_gather_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (3):
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
segsize);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[GATHER]));
return (MPI_ERR_ARG);
}

View file

@ -2,18 +2,18 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
@ -26,13 +26,13 @@
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_dynamic_rules.h"
#include "coll_tuned_dynamic_file.h"
static int tuned_module_enable(mca_coll_base_module_t *module,
struct ompi_communicator_t *comm);
/*
* Initial query function that is invoked during MPI_INIT, allowing
* this component to disqualify itself if it doesn't support the
@ -79,8 +79,8 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
*priority = ompi_coll_tuned_priority;
/*
* Choose whether to use [intra|inter] decision functions
* and if using fixed OR dynamic rule sets.
* Right now you cannot mix them, maybe later on it can be changed
* but this would probably add an extra if and funct call to the path
@ -114,9 +114,9 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
/* We put all routines that handle the MCA user forced algorithm and parameter choices here */
/* recheck the setting of forced, called on module create (i.e. for each new comm) */
static int
ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
coll_tuned_force_algorithm_params_t *forced_values )
{
coll_tuned_force_algorithm_mca_param_indices_t* mca_params;
@ -145,20 +145,20 @@ ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
return (MPI_SUCCESS);
}
#define COLL_TUNED_EXECUTE_IF_DYNAMIC(DATA, TYPE, EXECUTE) \
#define COLL_TUNED_EXECUTE_IF_DYNAMIC(TMOD, TYPE, EXECUTE) \
{ \
int need_dynamic_decision = 0; \
ompi_coll_tuned_forced_getvalues( (TYPE), &((DATA)->user_forced[(TYPE)]) ); \
(DATA)->com_rules[(TYPE)] = NULL; \
if( 0 != (DATA)->user_forced[(TYPE)].algorithm ) { \
ompi_coll_tuned_forced_getvalues( (TYPE), &((TMOD)->user_forced[(TYPE)]) ); \
(TMOD)->com_rules[(TYPE)] = NULL; \
if( 0 != (TMOD)->user_forced[(TYPE)].algorithm ) { \
need_dynamic_decision = 1; \
EXECUTE; \
} \
if( NULL != mca_coll_tuned_component.all_base_rules ) { \
(DATA)->com_rules[(TYPE)] \
(TMOD)->com_rules[(TYPE)] \
= ompi_coll_tuned_get_com_rule_ptr( mca_coll_tuned_component.all_base_rules, \
(TYPE), size ); \
if( NULL != (DATA)->com_rules[(TYPE)] ) { \
if( NULL != (TMOD)->com_rules[(TYPE)] ) { \
need_dynamic_decision = 1; \
} \
} \
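Hand-expanding one invocation makes the DATA -> TMOD rename easier to review. Roughly, for the ALLGATHER case (paraphrased from the macro body above, not a literal preprocessor dump; the tail of the macro lies outside this hunk):

    {
        int need_dynamic_decision = 0;
        ompi_coll_tuned_forced_getvalues(ALLGATHER,
                                         &(tuned_module->user_forced[ALLGATHER]));
        tuned_module->com_rules[ALLGATHER] = NULL;
        if (0 != tuned_module->user_forced[ALLGATHER].algorithm) {
            need_dynamic_decision = 1;
            tuned_module->super.coll_allgather =
                ompi_coll_tuned_allgather_intra_dec_dynamic;  /* the EXECUTE arg */
        }
        if (NULL != mca_coll_tuned_component.all_base_rules) {
            tuned_module->com_rules[ALLGATHER] =
                ompi_coll_tuned_get_com_rule_ptr(mca_coll_tuned_component.all_base_rules,
                                                 ALLGATHER, size);
            if (NULL != tuned_module->com_rules[ALLGATHER])
                need_dynamic_decision = 1;
        }
        /* ... remainder of the macro continues below this hunk ... */
    }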
@ -178,7 +178,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
{
int size;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module;
mca_coll_tuned_comm_t *data = NULL;
mca_coll_base_comm_t *data = NULL;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));
@ -191,32 +191,19 @@ tuned_module_enable( mca_coll_base_module_t *module,
/**
* we still malloc data as it is used by the TUNED modules
* if we don't allocate it and fall back to a BASIC module routine then it confuses debuggers
* we place any special info after the default data
*
* BUT on very large systems we might not be able to allocate all this memory so
* we do check an MCA parameter to see if we should allocate this memory
*
* The default is set very high
*/
/* if we are within the memory/size limit, allow preallocated data */
if( size <= ompi_coll_tuned_preallocate_memory_comm_size_limit ) {
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t) +
(sizeof(ompi_request_t *) * size * 2));
if (NULL == data) {
return OMPI_ERROR;
}
data->mcct_reqs = (ompi_request_t **) (data + 1);
data->mcct_num_reqs = size * 2;
} else {
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t));
if (NULL == data) {
return OMPI_ERROR;
}
data->mcct_reqs = (ompi_request_t **) NULL;
data->mcct_num_reqs = 0;
data = OBJ_NEW(mca_coll_base_comm_t);
if (NULL == data) {
return OMPI_ERROR;
}
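The hand-rolled malloc bookkeeping is replaced here by OPAL's reference-counted object system. For readers unfamiliar with it, a generic sketch of the pattern follows; the type and function names are hypothetical, not the actual mca_coll_base_comm_t definition:

    #include "opal/class/opal_object.h"

    /* Hypothetical class illustrating the OBJ_NEW/OBJ_RELEASE pattern. */
    struct my_comm_data_t {
        opal_object_t super;       /* must come first: provides the refcount */
        int           cached_value;
    };
    typedef struct my_comm_data_t my_comm_data_t;

    static void my_comm_data_construct(my_comm_data_t *d) { d->cached_value = 0; }
    static void my_comm_data_destruct(my_comm_data_t *d)  { /* release caches */ }

    OBJ_CLASS_INSTANCE(my_comm_data_t, opal_object_t,
                       my_comm_data_construct, my_comm_data_destruct);

    /* usage:
     *   my_comm_data_t *d = OBJ_NEW(my_comm_data_t);   refcount == 1
     *   OBJ_RELEASE(d);                                destructs at zero */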
if (ompi_coll_tuned_use_dynamic_rules) {
@ -230,37 +217,37 @@ tuned_module_enable( mca_coll_base_module_t *module,
* next dynamic state, recheck all forced rules as well
* warning, we should check to make sure this is really an INTRA comm here...
*/
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHER,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLGATHER,
tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHERV,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLGATHERV,
tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLREDUCE,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLREDUCE,
tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALL,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALL,
tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLV,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALLV,
tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLW,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALLW,
tuned_module->super.coll_alltoallw = NULL);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BARRIER,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, BARRIER,
tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BCAST,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, BCAST,
tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, EXSCAN,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, EXSCAN,
tuned_module->super.coll_exscan = NULL);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHER,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, GATHER,
tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHERV,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, GATHERV,
tuned_module->super.coll_gatherv = NULL);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCE,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCE,
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCESCATTER,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCESCATTER,
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCAN,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCAN,
tuned_module->super.coll_scan = NULL);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTER,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCATTER,
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_dynamic);
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTERV,
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCATTERV,
tuned_module->super.coll_scatterv = NULL);
if( false == ompi_coll_tuned_use_dynamic_rules ) {
@ -269,7 +256,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
" decision by lack of dynamic rules"));
}
}
/* general n fan out tree */
data->cached_ntree = NULL;
/* binary tree */
@ -286,7 +273,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
data->cached_in_order_bintree = NULL;
/* All done */
tuned_module->tuned_data = data;
tuned_module->super.base_data = data;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
return OMPI_SUCCESS;

View file

@ -0,0 +1,222 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
/* reduce algorithm variables */
static int coll_tuned_reduce_forced_algorithm = 0;
static int coll_tuned_reduce_segment_size = 0;
static int coll_tuned_reduce_max_requests;
static int coll_tuned_reduce_tree_fanout;
static int coll_tuned_reduce_chain_fanout;
/* valid values for coll_tuned_reduce_forced_algorithm */
static mca_base_var_enum_value_t reduce_algorithms[] = {
{0, "ignore"},
{1, "linear"},
{2, "chain"},
{3, "pipeline"},
{4, "binary"},
{5, "binomial"},
{6, "in-order_binary"},
{0, NULL}
};
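These tables end in a {0, NULL} sentinel, which is what lets the registration code below count the entries at runtime instead of carrying a separate *_algorithm_count variable that can drift out of sync (as in the older per-collective files). The idiom in isolation:

    /* Count entries in a {value, string} table terminated by {0, NULL}. */
    static int count_algorithms(const mca_base_var_enum_value_t *table)
    {
        int cnt;
        for (cnt = 0; NULL != table[cnt].string; cnt++)
            ;  /* the NULL string of the sentinel stops the loop */
        return cnt;
    }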
/**
* The following are used by dynamic and forced rules
*
* publish details of each algorithm and if it's forced/fixed/locked in
* as you add methods/algorithms you must update this and the query/map routines
*
* this routine is called by the component only
* this makes sure that the mca parameters are set to their initial values and
* perms; the module does not call this, it calls the forced_getvalues routine
* instead.
*/
int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t*new_enum;
int cnt;
for( cnt = 0; NULL != reduce_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[REDUCE] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_count",
"Number of reduce algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_reduce_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_reduce_algorithms", reduce_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm",
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_reduce_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_segmentsize",
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_segment_size);
coll_tuned_reduce_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_tree_fanout",
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_tree_fanout);
coll_tuned_reduce_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_chain_fanout",
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_chain_fanout);
coll_tuned_reduce_max_requests = 0; /* no limit for reduce by default */
mca_param_indices->max_requests_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_algorithm_max_requests",
"Maximum number of outstanding send requests on leaf nodes. 0 means no limit.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_max_requests);
if (mca_param_indices->max_requests_param_index < 0) {
return mca_param_indices->max_requests_param_index;
}
if (coll_tuned_reduce_max_requests < 0) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Maximum outstanding requests must be a positive number or 0. Initializing to 0 (no limit).\n" );
}
coll_tuned_reduce_max_requests = 0;
}
return (MPI_SUCCESS);
}
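The segment size registered here is in bytes, while the segmented reduce implementations operate in element counts. A sketch of the usual conversion (illustrative only; the base code uses its own macro for this):

    #include <stddef.h>

    /* segsize == 0 means "no segmentation" per the parameter description. */
    static void compute_segments(size_t type_size, int count, int segsize,
                                 int *seg_count, int *num_segments)
    {
        if (segsize <= 0 || (size_t)segsize >= type_size * (size_t)count) {
            *seg_count = count;           /* one segment: the whole message */
            *num_segments = 1;
            return;
        }
        *seg_count = (int)((size_t)segsize / type_size);
        if (*seg_count < 1) *seg_count = 1;
        *num_segments = (count + *seg_count - 1) / *seg_count;  /* ceiling */
    }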
int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
const int segsize = tuned_module->user_forced[REDUCE].segsize;
const int chain_fanout = tuned_module->user_forced[REDUCE].chain_fanout;
const int max_requests = tuned_module->user_forced[REDUCE].max_requests;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
tuned_module->user_forced[REDUCE].algorithm));
switch (tuned_module->user_forced[REDUCE].algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype,
op, root, comm, module);
case (1): return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
op, root, comm, module);
case (2): return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, chain_fanout, max_requests);
case (3): return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (4): return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (5): return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (6): return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op, int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout,
int segsize, int max_requests )
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed(sbuf, rbuf, count, dtype,
op, root, comm, module);
case (1): return ompi_coll_base_reduce_intra_basic_linear(sbuf, rbuf, count, dtype,
op, root, comm, module);
case (2): return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, faninout, max_requests);
case (3): return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (4): return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (5): return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
case (6): return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
op, root, comm, module,
segsize, max_requests);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
return (MPI_ERR_ARG);
}

View file

@ -0,0 +1,173 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/op/op.h"
#include "coll_tuned.h"
/* reduce_scatter algorithm variables */
static int coll_tuned_reduce_scatter_forced_algorithm = 0;
static int coll_tuned_reduce_scatter_segment_size = 0;
static int coll_tuned_reduce_scatter_tree_fanout;
static int coll_tuned_reduce_scatter_chain_fanout;
/* valid values for coll_tuned_reduce_scatter_forced_algorithm */
static mca_base_var_enum_value_t reduce_scatter_algorithms[] = {
{0, "ignore"},
{1, "non-overlapping"},
{2, "recursive_halfing"},
{3, "ring"},
{0, NULL}
};
/**
* The following are used by dynamic and forced rules
*
* publish details of each algorithm and if it's forced/fixed/locked in
* as you add methods/algorithms you must update this and the query/map routines
*
* this routine is called by the component only
* this makes sure that the mca parameters are set to their initial values and
* perms; the module does not call this, it calls the forced_getvalues routine
* instead.
*/
int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != reduce_scatter_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_count",
"Number of reduce_scatter algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_reduce_scatter_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_reduce_scatter_algorithms", reduce_scatter_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm",
"Which reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_reduce_scatter_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_segmentsize",
"Segment size in bytes used by default for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_segment_size);
coll_tuned_reduce_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_tree_fanout",
"Fanout for n-tree used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_tree_fanout);
coll_tuned_reduce_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"reduce_scatter_algorithm_chain_fanout",
"Fanout for chains used for reduce_scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_reduce_scatter_chain_fanout);
return (MPI_SUCCESS);
}
int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d",
tuned_module->user_forced[REDUCESCATTER].algorithm));
switch (tuned_module->user_forced[REDUCESCATTER].algorithm) {
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
dtype, op, comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
return (MPI_ERR_ARG);
}
int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
dtype, op, comm, module);
case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
dtype, op, comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
return (MPI_ERR_ARG);
}
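From the application side all of this stays behind a plain MPI_Reduce_scatter call; the forced/dynamic machinery only changes which implementation services it. A self-contained example using nothing beyond standard MPI:

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        int rank, size, recv = 0;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        /* every rank contributes `size` ints; each rank receives one sum */
        int *sendbuf = malloc(size * sizeof(int));
        int *rcounts = malloc(size * sizeof(int));
        for (int i = 0; i < size; i++) { sendbuf[i] = rank + i; rcounts[i] = 1; }

        MPI_Reduce_scatter(sendbuf, &recv, rcounts, MPI_INT, MPI_SUM,
                           MPI_COMM_WORLD);
        printf("rank %d received %d\n", rank, recv);

        free(sendbuf); free(rcounts);
        MPI_Finalize();
        return 0;
    }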

View file

@ -1,421 +0,0 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
#include "coll_tuned_util.h"
/* scatter algorithm variables */
static int coll_tuned_scatter_algorithm_count = 2;
static int coll_tuned_scatter_forced_algorithm = 0;
static int coll_tuned_scatter_segment_size = 0;
static int coll_tuned_scatter_tree_fanout;
static int coll_tuned_scatter_chain_fanout;
/* valid values for coll_tuned_scatter_forced_algorithm */
static mca_base_var_enum_value_t scatter_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "binomial"},
{0, NULL}
};
int
ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int line = -1, i, rank, vrank, size, total_send = 0, err;
char *ptmp, *tempbuf = NULL;
ompi_coll_tree_t* bmtree;
MPI_Status status;
MPI_Aint sextent, slb, strue_lb, strue_extent;
MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
OPAL_OUTPUT((ompi_coll_tuned_stream,
"ompi_coll_tuned_scatter_intra_binomial rank %d", rank));
/* create the binomial tree */
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
bmtree = data->cached_in_order_bmtree;
ompi_datatype_get_extent(sdtype, &slb, &sextent);
ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent);
ompi_datatype_get_extent(rdtype, &rlb, &rextent);
ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);
vrank = (rank - root + size) % size;
ptmp = (char *) rbuf; /* by default assume leaf nodes, just use rbuf */
if (rank == root) {
if (0 == root) {
/* root on 0, just use the send buffer */
ptmp = (char *) sbuf;
if (rbuf != MPI_IN_PLACE) {
/* local copy to rbuf */
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
rbuf, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
} else {
/* root is not on 0, allocate temp buffer for send */
tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
if (NULL == tempbuf) {
err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
}
ptmp = tempbuf - strue_lb;
/* and rotate data so they will eventually be in the right place */
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root),
ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root,
ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
if (rbuf != MPI_IN_PLACE) {
/* local copy to rbuf */
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
rbuf, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
}
total_send = scount;
} else if (!(vrank % 2)) {
/* non-root, non-leaf nodes, allocate temp buffer for recv
* the most we need is rcount*size/2 */
tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
if (NULL == tempbuf) {
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
}
ptmp = tempbuf - rtrue_lb;
sdtype = rdtype;
scount = rcount;
sextent = rextent;
total_send = scount;
}
if (!(vrank % 2)) {
if (rank != root) {
/* recv from parent on non-root */
err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev,
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
/* local copy to rbuf */
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
rbuf, rcount, rdtype);
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
/* send to children on all non-leaf */
for (i = 0; i < bmtree->tree_nextsize; i++) {
size_t mycount = 0;
int vkid;
/* figure out how much data I have to send to this child */
vkid = (bmtree->tree_next[i] - root + size) % size;
mycount = vkid - vrank;
if( (int)mycount > (size - vkid) )
mycount = size - vkid;
mycount *= scount;
err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype,
bmtree->tree_next[i],
MCA_COLL_BASE_TAG_SCATTER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
total_send += mycount;
}
if (NULL != tempbuf)
free(tempbuf);
} else {
/* recv from parent on leaf nodes */
err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev,
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}
return MPI_SUCCESS;
err_hndl:
if (NULL != tempbuf)
free(tempbuf);
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
return err;
}
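The per-child send count in the loop above is the only subtle arithmetic in this function; written out on its own (names mirror the loop variables):

    /* Elements that root-relative rank `vrank` forwards to the child whose
     * virtual rank is `vkid`, in a binomial scatter over `size` ranks where
     * every final recipient gets `scount` elements: the child takes the data
     * for its whole subtree, clipped at the end of the rank range. */
    static size_t binomial_child_count(int vrank, int vkid, int size, int scount)
    {
        size_t mycount = (size_t)(vkid - vrank);
        if ((int)mycount > (size - vkid))
            mycount = (size_t)(size - vkid);
        return mycount * (size_t)scount;
    }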
/*
* Linear functions are copied from the BASIC coll module
* they do not segment the message and are simple implementations
* but for some small number of nodes and/or small data sizes they
* are just as fast as tuned/tree based segmenting operations
* and as such may be selected by the decision functions.
* These are copied into this module due to the way we select modules
* in V1. i.e. in V2 we will handle this differently and so will not
* have to duplicate code.
* JPG following the examples from other coll_tuned implementations. Dec06.
*/
/* copied function (with appropriate renaming) starts here */
/*
* scatter_intra
*
* Function: - basic scatter operation
* Accepts: - same arguments as MPI_Scatter()
* Returns: - MPI_SUCCESS or error code
*/
int
ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, err;
ptrdiff_t lb, incr;
char *ptmp;
/* Initialize */
rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm);
/* If not root, receive data. */
if (rank != root) {
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
MCA_COLL_BASE_TAG_SCATTER,
comm, MPI_STATUS_IGNORE));
return err;
}
/* I am the root, loop sending data. */
err = ompi_datatype_get_extent(sdtype, &lb, &incr);
if (OMPI_SUCCESS != err) {
return OMPI_ERROR;
}
incr *= scount;
for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
/* simple optimization */
if (i == rank) {
if (MPI_IN_PLACE != rbuf) {
err =
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
rdtype);
}
} else {
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
MCA_COLL_BASE_TAG_SCATTER,
MCA_PML_BASE_SEND_STANDARD, comm));
}
if (MPI_SUCCESS != err) {
return err;
}
}
/* All done */
return MPI_SUCCESS;
}
/* copied function (with appropriate renaming) ends here */
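For completeness, the call such a linear implementation services is the ordinary MPI_Scatter; a minimal program that exercises it (standard MPI only):

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        int rank, size, recv;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        int *sendbuf = NULL;
        if (0 == rank) {            /* only the root's send buffer is read */
            sendbuf = malloc(size * sizeof(int));
            for (int i = 0; i < size; i++) sendbuf[i] = i * i;
        }
        MPI_Scatter(sendbuf, 1, MPI_INT, &recv, 1, MPI_INT, 0, MPI_COMM_WORLD);
        printf("rank %d got %d\n", rank, recv);

        if (0 == rank) free(sendbuf);
        MPI_Finalize();
        return 0;
    }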
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if it's forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* the module does not call this; it calls the forced_getvalues routine instead */
int
ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
ompi_coll_tuned_forced_max_algorithms[SCATTER] = coll_tuned_scatter_algorithm_count;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_count",
"Number of scatter algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&coll_tuned_scatter_algorithm_count);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_scatter_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_scatter_algorithms", scatter_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm",
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_scatter_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_segmentsize",
"Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_segment_size);
coll_tuned_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_tree_fanout",
"Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_tree_fanout);
coll_tuned_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index=
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_chain_fanout",
"Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_chain_fanout);
return (MPI_SUCCESS);
}
int
ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_forced selected algorithm %d",
data->user_forced[SCATTER].algorithm));
switch (data->user_forced[SCATTER].algorithm) {
case (0):
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
data->user_forced[SCATTER].algorithm,
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}
int
ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
default:
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm,
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
return (MPI_ERR_ARG);
} /* switch */
}

View file

@ -0,0 +1,185 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/coll/base/coll_base_topo.h"
#include "ompi/mca/coll/base/coll_base_util.h"
#include "ompi/mca/pml/pml.h"
#include "coll_tuned.h"
/* scatter algorithm variables */
static int coll_tuned_scatter_forced_algorithm = 0;
static int coll_tuned_scatter_segment_size = 0;
static int coll_tuned_scatter_tree_fanout;
static int coll_tuned_scatter_chain_fanout;
/* valid values for coll_tuned_scatter_forced_algorithm */
static mca_base_var_enum_value_t scatter_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "binomial"},
{0, NULL}
};
/* The following are used by dynamic and forced rules */
/* publish details of each algorithm and if it's forced/fixed/locked in */
/* as you add methods/algorithms you must update this and the query/map
routines */
/* this routine is called by the component only */
/* this makes sure that the mca parameters are set to their initial values
and perms */
/* the module does not call this; it calls the forced_getvalues routine instead */
int
ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices)
{
mca_base_var_enum_t *new_enum;
int cnt;
for( cnt = 0; NULL != scatter_algorithms[cnt].string; cnt++ );
ompi_coll_tuned_forced_max_algorithms[SCATTER] = cnt;
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_count",
"Number of scatter algorithms available",
MCA_BASE_VAR_TYPE_INT, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_CONSTANT,
&cnt);
/* MPI_T: This variable should eventually be bound to a communicator */
coll_tuned_scatter_forced_algorithm = 0;
(void) mca_base_var_enum_create("coll_tuned_scatter_algorithms", scatter_algorithms, &new_enum);
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm",
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_forced_algorithm);
OBJ_RELEASE(new_enum);
if (mca_param_indices->algorithm_param_index < 0) {
return mca_param_indices->algorithm_param_index;
}
coll_tuned_scatter_segment_size = 0;
mca_param_indices->segsize_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_segmentsize",
"Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_segment_size);
coll_tuned_scatter_tree_fanout = ompi_coll_tuned_init_tree_fanout; /* get system wide default */
mca_param_indices->tree_fanout_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_tree_fanout",
"Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_tree_fanout);
coll_tuned_scatter_chain_fanout = ompi_coll_tuned_init_chain_fanout; /* get system wide default */
mca_param_indices->chain_fanout_param_index=
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_chain_fanout",
"Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&coll_tuned_scatter_chain_fanout);
return (MPI_SUCCESS);
}
int
ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_forced selected algorithm %d",
tuned_module->user_forced[SCATTER].algorithm));
switch (tuned_module->user_forced[SCATTER].algorithm) {
case (0):
return ompi_coll_tuned_scatter_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
tuned_module->user_forced[SCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[SCATTER]));
return MPI_ERR_ARG;
}
int
ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int algorithm, int faninout, int segsize)
{
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
algorithm, faninout, segsize));
switch (algorithm) {
case (0):
return ompi_coll_tuned_scatter_intra_dec_fixed(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (1):
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (2):
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
algorithm, ompi_coll_tuned_forced_max_algorithms[SCATTER]));
return MPI_ERR_ARG;
}
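Finally, to exercise a specific do_this/do_forced path without writing a rules file, Open MPI picks up MCA parameters from OMPI_MCA_-prefixed environment variables. A hedged sketch (the variable names match those registered above; setenv must happen before MPI_Init):

    #include <mpi.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        /* enable the forced/dynamic path and force "binomial" (2) per the enum */
        setenv("OMPI_MCA_coll_tuned_use_dynamic_rules", "1", 1);
        setenv("OMPI_MCA_coll_tuned_scatter_algorithm", "2", 1);

        MPI_Init(&argc, &argv);
        /* ... any MPI_Scatter on an intra-communicator now dispatches
         * through the forced decision path sketched above ... */
        MPI_Finalize();
        return 0;
    }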