Spring cleanup. Nothing important.
This commit was SVN r26247.
Этот коммит содержится в:
родитель
654c75ff24
Коммит
f09e3ce5a4
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -90,114 +90,111 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount,
|
|||||||
int *rdispls,
|
int *rdispls,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1, err = 0;
|
int line = -1, err = 0, rank, size, sendto, recvfrom, distance, blockcount, i;
|
||||||
int rank, size;
|
int *new_rcounts = NULL, *new_rdispls = NULL, *new_scounts = NULL, *new_sdispls = NULL;
|
||||||
int sendto, recvfrom, distance, blockcount, i;
|
ptrdiff_t slb, rlb, sext, rext;
|
||||||
int *new_rcounts = NULL, *new_rdispls = NULL;
|
char *tmpsend = NULL, *tmprecv = NULL;
|
||||||
int *new_scounts = NULL, *new_sdispls = NULL;
|
struct ompi_datatype_t *new_rdtype, *new_sdtype;
|
||||||
ptrdiff_t slb, rlb, sext, rext;
|
|
||||||
char *tmpsend = NULL, *tmprecv = NULL;
|
|
||||||
struct ompi_datatype_t *new_rdtype, *new_sdtype;
|
|
||||||
|
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:allgather_intra_bruck rank %d", rank));
|
"coll:tuned:allgather_intra_bruck rank %d", rank));
|
||||||
|
|
||||||
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
|
err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
/* Initialization step:
|
/* Initialization step:
|
||||||
- if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
|
- if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of
|
||||||
the receive buffer.
|
the receive buffer.
|
||||||
*/
|
*/
|
||||||
tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
|
tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext;
|
||||||
if (MPI_IN_PLACE != sbuf) {
|
if (MPI_IN_PLACE != sbuf) {
|
||||||
tmpsend = (char*) sbuf;
|
tmpsend = (char*) sbuf;
|
||||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||||
tmprecv, rcounts[rank], rdtype);
|
tmprecv, rcounts[rank], rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Communication step:
|
/* Communication step:
|
||||||
At every step i, rank r:
|
At every step i, rank r:
|
||||||
- doubles the distance
|
- doubles the distance
|
||||||
- sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
|
- sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i])
|
||||||
to rank (r - distance)
|
to rank (r - distance)
|
||||||
- receives message of blockcount blocks,
|
- receives message of blockcount blocks,
|
||||||
(rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
|
(rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from
|
||||||
rank (r + distance)
|
rank (r + distance)
|
||||||
- blockcount doubles until the last step when only the remaining data is
|
- blockcount doubles until the last step when only the remaining data is
|
||||||
exchanged.
|
exchanged.
|
||||||
*/
|
*/
|
||||||
blockcount = 1;
|
blockcount = 1;
|
||||||
tmpsend = (char*) rbuf;
|
tmpsend = (char*) rbuf;
|
||||||
|
|
||||||
new_rcounts = (int*) calloc(4*size, sizeof(int));
|
new_rcounts = (int*) calloc(4*size, sizeof(int));
|
||||||
if (NULL == new_rcounts) { err = -1; line = __LINE__; goto err_hndl; }
|
if (NULL == new_rcounts) { err = -1; line = __LINE__; goto err_hndl; }
|
||||||
new_rdispls = new_rcounts + size;
|
new_rdispls = new_rcounts + size;
|
||||||
new_scounts = new_rdispls + size;
|
new_scounts = new_rdispls + size;
|
||||||
new_sdispls = new_scounts + size;
|
new_sdispls = new_scounts + size;
|
||||||
|
|
||||||
for (distance = 1; distance < size; distance<<=1) {
|
for (distance = 1; distance < size; distance<<=1) {
|
||||||
|
|
||||||
recvfrom = (rank + distance) % size;
|
recvfrom = (rank + distance) % size;
|
||||||
sendto = (rank - distance + size) % size;
|
sendto = (rank - distance + size) % size;
|
||||||
|
|
||||||
if (distance <= (size >> 1)) {
|
if (distance <= (size >> 1)) {
|
||||||
blockcount = distance;
|
blockcount = distance;
|
||||||
} else {
|
} else {
|
||||||
blockcount = size - distance;
|
blockcount = size - distance;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* create send and receive datatypes */
|
/* create send and receive datatypes */
|
||||||
for (i = 0; i < blockcount; i++) {
|
for (i = 0; i < blockcount; i++) {
|
||||||
const int tmp_srank = (rank + i) % size;
|
const int tmp_srank = (rank + i) % size;
|
||||||
const int tmp_rrank = (recvfrom + i) % size;
|
const int tmp_rrank = (recvfrom + i) % size;
|
||||||
new_scounts[i] = rcounts[tmp_srank];
|
new_scounts[i] = rcounts[tmp_srank];
|
||||||
new_sdispls[i] = rdispls[tmp_srank];
|
new_sdispls[i] = rdispls[tmp_srank];
|
||||||
new_rcounts[i] = rcounts[tmp_rrank];
|
new_rcounts[i] = rcounts[tmp_rrank];
|
||||||
new_rdispls[i] = rdispls[tmp_rrank];
|
new_rdispls[i] = rdispls[tmp_rrank];
|
||||||
}
|
}
|
||||||
err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls,
|
err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls,
|
||||||
rdtype, &new_sdtype);
|
rdtype, &new_sdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
err = ompi_datatype_create_indexed(blockcount, new_rcounts, new_rdispls,
|
err = ompi_datatype_create_indexed(blockcount, new_rcounts, new_rdispls,
|
||||||
rdtype, &new_rdtype);
|
rdtype, &new_rdtype);
|
||||||
|
|
||||||
err = ompi_datatype_commit(&new_sdtype);
|
err = ompi_datatype_commit(&new_sdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
err = ompi_datatype_commit(&new_rdtype);
|
err = ompi_datatype_commit(&new_rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
/* Sendreceive */
|
/* Sendreceive */
|
||||||
err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto,
|
err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto,
|
||||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||||
rbuf, 1, new_rdtype, recvfrom,
|
rbuf, 1, new_rdtype, recvfrom,
|
||||||
MCA_COLL_BASE_TAG_ALLGATHERV,
|
MCA_COLL_BASE_TAG_ALLGATHERV,
|
||||||
comm, MPI_STATUS_IGNORE, rank);
|
comm, MPI_STATUS_IGNORE, rank);
|
||||||
ompi_datatype_destroy(&new_sdtype);
|
ompi_datatype_destroy(&new_sdtype);
|
||||||
ompi_datatype_destroy(&new_rdtype);
|
ompi_datatype_destroy(&new_rdtype);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
free(new_rcounts);
|
free(new_rcounts);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
|
|
||||||
err_hndl:
|
err_hndl:
|
||||||
if( NULL != new_rcounts ) free(new_rcounts);
|
if( NULL != new_rcounts ) free(new_rcounts);
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||||
__FILE__, line, err, rank));
|
__FILE__, line, err, rank));
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -221,12 +218,9 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
|
|||||||
void* rbuf, int *rcounts, int *rdisps,
|
void* rbuf, int *rcounts, int *rdisps,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1;
|
int line = -1, rank, size, sendto, recvfrom, i, recvdatafrom, senddatafrom, err = 0;
|
||||||
int rank, size;
|
|
||||||
int sendto, recvfrom, i, recvdatafrom, senddatafrom;
|
|
||||||
int err = 0;
|
|
||||||
ptrdiff_t slb, rlb, sext, rext;
|
ptrdiff_t slb, rlb, sext, rext;
|
||||||
char *tmpsend = NULL, *tmprecv = NULL;
|
char *tmpsend = NULL, *tmprecv = NULL;
|
||||||
|
|
||||||
@ -250,7 +244,7 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount,
|
|||||||
if (MPI_IN_PLACE != sbuf) {
|
if (MPI_IN_PLACE != sbuf) {
|
||||||
tmpsend = (char*) sbuf;
|
tmpsend = (char*) sbuf;
|
||||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||||
tmprecv, rcounts[rank], rdtype);
|
tmprecv, rcounts[rank], rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -354,14 +348,11 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
|||||||
void* rbuf, int *rcounts, int *rdispls,
|
void* rbuf, int *rcounts, int *rdispls,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1;
|
int line = -1, rank, size, i, even_rank, err = 0;
|
||||||
int rank, size;
|
|
||||||
int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
|
int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
|
||||||
int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
|
int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
|
||||||
int i, even_rank;
|
|
||||||
int err = 0;
|
|
||||||
ptrdiff_t slb, rlb, sext, rext;
|
ptrdiff_t slb, rlb, sext, rext;
|
||||||
char *tmpsend = NULL, *tmprecv = NULL;
|
char *tmpsend = NULL, *tmprecv = NULL;
|
||||||
struct ompi_datatype_t *new_rdtype, *new_sdtype;
|
struct ompi_datatype_t *new_rdtype, *new_sdtype;
|
||||||
@ -396,7 +387,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
|||||||
if (MPI_IN_PLACE != sbuf) {
|
if (MPI_IN_PLACE != sbuf) {
|
||||||
tmpsend = (char*) sbuf;
|
tmpsend = (char*) sbuf;
|
||||||
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
err = ompi_datatype_sndrcv(tmpsend, scount, sdtype,
|
||||||
tmprecv, rcounts[rank], rdtype);
|
tmprecv, rcounts[rank], rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -458,7 +449,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
|||||||
new_sdispls[0] = rdispls[send_data_from];
|
new_sdispls[0] = rdispls[send_data_from];
|
||||||
new_sdispls[1] = rdispls[(send_data_from + 1)];
|
new_sdispls[1] = rdispls[(send_data_from + 1)];
|
||||||
err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype,
|
err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype,
|
||||||
&new_sdtype);
|
&new_sdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
err = ompi_datatype_commit(&new_sdtype);
|
err = ompi_datatype_commit(&new_sdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
@ -468,7 +459,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount,
|
|||||||
new_rdispls[0] = rdispls[recv_data_from[i_parity]];
|
new_rdispls[0] = rdispls[recv_data_from[i_parity]];
|
||||||
new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
|
new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
|
||||||
err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype,
|
err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype,
|
||||||
&new_rdtype);
|
&new_rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
err = ompi_datatype_commit(&new_rdtype);
|
err = ompi_datatype_commit(&new_rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
@ -505,11 +496,9 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
|||||||
int *rdispls,
|
int *rdispls,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1, err = 0;
|
int line = -1, err = 0, rank, remote;
|
||||||
int rank;
|
|
||||||
int remote;
|
|
||||||
char *tmpsend = NULL, *tmprecv = NULL;
|
char *tmpsend = NULL, *tmprecv = NULL;
|
||||||
ptrdiff_t sext, rext, lb;
|
ptrdiff_t sext, rext, lb;
|
||||||
|
|
||||||
@ -548,8 +537,8 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount,
|
|||||||
/* Place your data in correct location if necessary */
|
/* Place your data in correct location if necessary */
|
||||||
if (MPI_IN_PLACE != sbuf) {
|
if (MPI_IN_PLACE != sbuf) {
|
||||||
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
|
err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype,
|
||||||
(char*)rbuf + (ptrdiff_t)rdispls[rank] * rext,
|
(char*)rbuf + (ptrdiff_t)rdispls[rank] * rext,
|
||||||
rcounts[rank], rdtype);
|
rcounts[rank], rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -591,12 +580,10 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
|||||||
int *disps,
|
int *disps,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i, size, rank ;
|
int i, size, rank, err;
|
||||||
int err;
|
MPI_Aint extent, lb;
|
||||||
MPI_Aint extent;
|
|
||||||
MPI_Aint lb;
|
|
||||||
char *send_buf = NULL;
|
char *send_buf = NULL;
|
||||||
struct ompi_datatype_t *newtype, *send_type;
|
struct ompi_datatype_t *newtype, *send_type;
|
||||||
|
|
||||||
@ -655,7 +642,7 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount,
|
|||||||
}
|
}
|
||||||
|
|
||||||
comm->c_coll.coll_bcast(rbuf, 1, newtype, 0, comm,
|
comm->c_coll.coll_bcast(rbuf, 1, newtype, 0, comm,
|
||||||
comm->c_coll.coll_bcast_module);
|
comm->c_coll.coll_bcast_module);
|
||||||
|
|
||||||
ompi_datatype_destroy (&newtype);
|
ompi_datatype_destroy (&newtype);
|
||||||
|
|
||||||
@ -736,14 +723,14 @@ int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
|
|||||||
int *rdispls,
|
int *rdispls,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
|
"coll:tuned:allgatherv_intra_do_forced selected algorithm %d",
|
||||||
data->user_forced[ALLGATHERV].algorithm));
|
data->user_forced[ALLGATHERV].algorithm));
|
||||||
|
|
||||||
switch (data->user_forced[ALLGATHERV].algorithm) {
|
switch (data->user_forced[ALLGATHERV].algorithm) {
|
||||||
case (0):
|
case (0):
|
||||||
@ -756,7 +743,7 @@ int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
|
|||||||
comm, module);
|
comm, module);
|
||||||
case (2):
|
case (2):
|
||||||
return ompi_coll_tuned_allgatherv_intra_bruck (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_bruck (sbuf, scount, sdtype,
|
||||||
rbuf, rcounts, rdispls, rdtype,
|
rbuf, rcounts, rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
case (3):
|
case (3):
|
||||||
return ompi_coll_tuned_allgatherv_intra_ring (sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_ring (sbuf, scount, sdtype,
|
||||||
@ -773,7 +760,7 @@ int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount,
|
|||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
"coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
data->user_forced[ALLGATHERV].algorithm,
|
data->user_forced[ALLGATHERV].algorithm,
|
||||||
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
|
ompi_coll_tuned_forced_max_algorithms[ALLGATHERV]));
|
||||||
return (MPI_ERR_ARG);
|
return (MPI_ERR_ARG);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
@ -787,7 +774,7 @@ int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount,
|
|||||||
int *rdispls,
|
int *rdispls,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int algorithm, int faninout,
|
int algorithm, int faninout,
|
||||||
int segsize)
|
int segsize)
|
||||||
{
|
{
|
||||||
@ -807,11 +794,11 @@ int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount,
|
|||||||
case (2):
|
case (2):
|
||||||
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||||
rbuf, rcounts, rdispls, rdtype,
|
rbuf, rcounts, rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
case (3):
|
case (3):
|
||||||
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||||
rbuf, rcounts, rdispls, rdtype,
|
rbuf, rcounts, rdispls, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
case (4):
|
case (4):
|
||||||
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||||
rbuf, rcounts, rdispls, rdtype,
|
rbuf, rcounts, rdispls, rdtype,
|
||||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -34,11 +34,9 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1, err = 0;
|
int line = -1, err = 0, rank, size, step, sendto, recvfrom;
|
||||||
int rank, size, step;
|
|
||||||
int sendto, recvfrom;
|
|
||||||
void * tmpsend, *tmprecv;
|
void * tmpsend, *tmprecv;
|
||||||
ptrdiff_t lb, sext, rext;
|
ptrdiff_t lb, sext, rext;
|
||||||
|
|
||||||
@ -89,15 +87,12 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i, k, line = -1;
|
int i, k, line = -1, rank, size, err = 0, weallocated = 0;
|
||||||
int rank, size;
|
|
||||||
int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
|
int sendto, recvfrom, distance, *displs = NULL, *blen = NULL;
|
||||||
char *tmpbuf = NULL, *tmpbuf_free = NULL;
|
char *tmpbuf = NULL, *tmpbuf_free = NULL;
|
||||||
ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
|
ptrdiff_t rlb, slb, tlb, sext, rext, tsext;
|
||||||
int err = 0;
|
|
||||||
int weallocated = 0;
|
|
||||||
struct ompi_datatype_t *new_ddt;
|
struct ompi_datatype_t *new_ddt;
|
||||||
#ifdef blahblah
|
#ifdef blahblah
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
@ -147,17 +142,17 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
|||||||
|
|
||||||
/* Step 1 - local rotation - shift up by rank */
|
/* Step 1 - local rotation - shift up by rank */
|
||||||
err = ompi_datatype_copy_content_same_ddt (sdtype,
|
err = ompi_datatype_copy_content_same_ddt (sdtype,
|
||||||
(int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
|
(int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount),
|
||||||
tmpbuf,
|
tmpbuf,
|
||||||
((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
|
((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext);
|
||||||
if (err<0) {
|
if (err<0) {
|
||||||
line = __LINE__; err = -1; goto err_hndl;
|
line = __LINE__; err = -1; goto err_hndl;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rank != 0) {
|
if (rank != 0) {
|
||||||
err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
|
err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount,
|
||||||
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
|
tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext,
|
||||||
(char*) sbuf);
|
(char*) sbuf);
|
||||||
if (err<0) {
|
if (err<0) {
|
||||||
line = __LINE__; err = -1; goto err_hndl;
|
line = __LINE__; err = -1; goto err_hndl;
|
||||||
}
|
}
|
||||||
@ -206,8 +201,8 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
|||||||
for (i = 0; i < size; i++) {
|
for (i = 0; i < size; i++) {
|
||||||
|
|
||||||
err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
|
err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount,
|
||||||
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
|
((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext),
|
||||||
tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
|
tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext);
|
||||||
if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
|
if (err < 0) { line = __LINE__; err = -1; goto err_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -253,18 +248,12 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int max_outstanding_reqs)
|
int max_outstanding_reqs)
|
||||||
{
|
{
|
||||||
int line, error;
|
int line, error, ri, si, rank, size, nreqs, nrreqs, nsreqs, total_reqs;
|
||||||
int ri, si;
|
char *psnd, *prcv;
|
||||||
int rank;
|
ptrdiff_t slb, sext, rlb, rext;
|
||||||
int size;
|
|
||||||
int nreqs, nrreqs, nsreqs, total_reqs;
|
|
||||||
char *psnd;
|
|
||||||
char *prcv;
|
|
||||||
ptrdiff_t slb, sext;
|
|
||||||
ptrdiff_t rlb, rext;
|
|
||||||
|
|
||||||
ompi_request_t **reqs = NULL;
|
ompi_request_t **reqs = NULL;
|
||||||
|
|
||||||
@ -318,67 +307,67 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount,
|
|||||||
/* Post first batch or ireceive and isend requests */
|
/* Post first batch or ireceive and isend requests */
|
||||||
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
|
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
|
||||||
ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
|
ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
|
||||||
error =
|
error =
|
||||||
MCA_PML_CALL(irecv
|
MCA_PML_CALL(irecv
|
||||||
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
|
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
|
||||||
MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
|
MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
|
||||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
}
|
||||||
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
|
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
|
||||||
si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
|
si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
|
||||||
error =
|
error =
|
||||||
MCA_PML_CALL(isend
|
MCA_PML_CALL(isend
|
||||||
(psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
|
(psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
|
||||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs]));
|
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs]));
|
||||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Wait for requests to complete */
|
/* Wait for requests to complete */
|
||||||
if (nreqs == 2 * (size - 1)) {
|
if (nreqs == 2 * (size - 1)) {
|
||||||
/* Optimization for the case when all requests have been posted */
|
/* Optimization for the case when all requests have been posted */
|
||||||
error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
|
error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
|
||||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
/* As requests complete, replace them with corresponding requests:
|
/* As requests complete, replace them with corresponding requests:
|
||||||
- wait for any request to complete, mark the request as
|
- wait for any request to complete, mark the request as
|
||||||
MPI_REQUEST_NULL
|
MPI_REQUEST_NULL
|
||||||
- If it was a receive request, replace it with new irecv request
|
- If it was a receive request, replace it with new irecv request
|
||||||
(if any)
|
(if any)
|
||||||
- if it was a send request, replace it with new isend request (if any)
|
- if it was a send request, replace it with new isend request (if any)
|
||||||
*/
|
*/
|
||||||
int ncreqs = 0;
|
int ncreqs = 0;
|
||||||
while (ncreqs < 2 * (size - 1)) {
|
while (ncreqs < 2 * (size - 1)) {
|
||||||
int completed;
|
int completed;
|
||||||
error = ompi_request_wait_any(2 * total_reqs, reqs, &completed,
|
error = ompi_request_wait_any(2 * total_reqs, reqs, &completed,
|
||||||
MPI_STATUS_IGNORE);
|
MPI_STATUS_IGNORE);
|
||||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||||
reqs[completed] = MPI_REQUEST_NULL;
|
reqs[completed] = MPI_REQUEST_NULL;
|
||||||
ncreqs++;
|
ncreqs++;
|
||||||
if (completed < total_reqs) {
|
if (completed < total_reqs) {
|
||||||
if (nrreqs < (size - 1)) {
|
if (nrreqs < (size - 1)) {
|
||||||
error =
|
error =
|
||||||
MCA_PML_CALL(irecv
|
MCA_PML_CALL(irecv
|
||||||
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
|
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
|
||||||
MCA_COLL_BASE_TAG_ALLTOALL, comm,
|
MCA_COLL_BASE_TAG_ALLTOALL, comm,
|
||||||
&reqs[completed]));
|
|
||||||
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
|
||||||
++nrreqs;
|
|
||||||
ri = (ri + 1) % size;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (nsreqs < (size - 1)) {
|
|
||||||
error = MCA_PML_CALL(isend
|
|
||||||
(psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
|
|
||||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm,
|
|
||||||
&reqs[completed]));
|
&reqs[completed]));
|
||||||
++nsreqs;
|
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
|
||||||
si = (si + size - 1) % size;
|
++nrreqs;
|
||||||
}
|
ri = (ri + 1) % size;
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
|
if (nsreqs < (size - 1)) {
|
||||||
|
error = MCA_PML_CALL(isend
|
||||||
|
(psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
|
||||||
|
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||||
|
MCA_PML_BASE_SEND_STANDARD, comm,
|
||||||
|
&reqs[completed]));
|
||||||
|
++nsreqs;
|
||||||
|
si = (si + size - 1) % size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Free the reqs */
|
/* Free the reqs */
|
||||||
@ -401,11 +390,9 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1, err = 0;
|
int line = -1, err = 0, rank, remote;
|
||||||
int rank;
|
|
||||||
int remote;
|
|
||||||
void * tmpsend, *tmprecv;
|
void * tmpsend, *tmprecv;
|
||||||
ptrdiff_t sext, rext, lb;
|
ptrdiff_t sext, rext, lb;
|
||||||
|
|
||||||
@ -436,9 +423,9 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
|||||||
|
|
||||||
/* ddt sendrecv your own data */
|
/* ddt sendrecv your own data */
|
||||||
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
|
err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount,
|
||||||
(int32_t) scount, sdtype,
|
(int32_t) scount, sdtype,
|
||||||
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
|
(char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount,
|
||||||
(int32_t) rcount, rdtype);
|
(int32_t) rcount, rdtype);
|
||||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
/* done */
|
/* done */
|
||||||
@ -474,21 +461,10 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
|
|||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i;
|
int i, rank, size, err, nreqs;
|
||||||
int rank;
|
char *psnd, *prcv;
|
||||||
int size;
|
MPI_Aint lb, sndinc, rcvinc;
|
||||||
int err;
|
ompi_request_t **req, **sreq, **rreq;
|
||||||
int nreqs;
|
|
||||||
char *psnd;
|
|
||||||
char *prcv;
|
|
||||||
MPI_Aint lb;
|
|
||||||
MPI_Aint sndinc;
|
|
||||||
MPI_Aint rcvinc;
|
|
||||||
|
|
||||||
ompi_request_t **req;
|
|
||||||
ompi_request_t **sreq;
|
|
||||||
ompi_request_t **rreq;
|
|
||||||
|
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
@ -653,12 +629,12 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
|
|||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
mca_param_indices->max_requests_param_index
|
mca_param_indices->max_requests_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"alltoall_algorithm_max_requests",
|
"alltoall_algorithm_max_requests",
|
||||||
"Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.",
|
"Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.",
|
||||||
false, false,
|
false, false,
|
||||||
ompi_coll_tuned_init_max_requests, /* get system wide default */
|
ompi_coll_tuned_init_max_requests, /* get system wide default */
|
||||||
NULL);
|
NULL);
|
||||||
if (mca_param_indices->max_requests_param_index < 0) {
|
if (mca_param_indices->max_requests_param_index < 0) {
|
||||||
return mca_param_indices->algorithm_param_index;
|
return mca_param_indices->algorithm_param_index;
|
||||||
}
|
}
|
||||||
@ -682,7 +658,7 @@ int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -711,7 +687,7 @@ int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
|
|||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int algorithm, int faninout, int segsize,
|
int algorithm, int faninout, int segsize,
|
||||||
int max_requests)
|
int max_requests)
|
||||||
{
|
{
|
||||||
|
@ -38,9 +38,7 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
|
|||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1, err = 0;
|
int line = -1, err = 0, rank, size, step, sendto, recvfrom;
|
||||||
int rank, size, step;
|
|
||||||
int sendto, recvfrom;
|
|
||||||
void *psnd, *prcv;
|
void *psnd, *prcv;
|
||||||
ptrdiff_t sext, rext;
|
ptrdiff_t sext, rext;
|
||||||
|
|
||||||
@ -58,7 +56,7 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps,
|
|||||||
|
|
||||||
if (0 != scounts[rank]) {
|
if (0 != scounts[rank]) {
|
||||||
err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype,
|
err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype,
|
||||||
prcv, rcounts[rank], rdtype);
|
prcv, rcounts[rank], rdtype);
|
||||||
if (MPI_SUCCESS != err) {
|
if (MPI_SUCCESS != err) {
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
@ -115,9 +113,8 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis
|
|||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i, size, rank, err;
|
int i, size, rank, err, nreqs;
|
||||||
char *psnd, *prcv;
|
char *psnd, *prcv;
|
||||||
int nreqs;
|
|
||||||
ptrdiff_t sext, rext;
|
ptrdiff_t sext, rext;
|
||||||
MPI_Request *preq;
|
MPI_Request *preq;
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
@ -305,16 +302,16 @@ int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisp
|
|||||||
switch (algorithm) {
|
switch (algorithm) {
|
||||||
case (0):
|
case (0):
|
||||||
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
|
return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype,
|
||||||
rbuf, rcounts, rdisps, rdtype,
|
rbuf, rcounts, rdisps, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
case (1):
|
case (1):
|
||||||
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
|
return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype,
|
||||||
rbuf, rcounts, rdisps, rdtype,
|
rbuf, rcounts, rdisps, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
case (2):
|
case (2):
|
||||||
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||||
rbuf, rcounts, rdisps, rdtype,
|
rbuf, rcounts, rdisps, rdtype,
|
||||||
comm, module);
|
comm, module);
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:alltoall_intra_do_this attempt to select "
|
"coll:tuned:alltoall_intra_do_this attempt to select "
|
||||||
|
@ -50,12 +50,9 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int rank, size;
|
int rank, size, err = 0, line = 0, left, right;
|
||||||
int err=0, line=0;
|
|
||||||
int left, right;
|
|
||||||
|
|
||||||
|
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
@ -122,11 +119,9 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm,
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
|
int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int rank, size, adjsize;
|
int rank, size, adjsize, err, line, mask, remote;
|
||||||
int err, line;
|
|
||||||
int mask, remote;
|
|
||||||
|
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
@ -206,11 +201,9 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int rank, size;
|
int rank, size, distance, to, from, err, line = 0;
|
||||||
int distance, to, from;
|
|
||||||
int err, line = 0;
|
|
||||||
|
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
@ -245,7 +238,7 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm,
|
|||||||
*/
|
*/
|
||||||
/* special case for two processes */
|
/* special case for two processes */
|
||||||
int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int remote, err;
|
int remote, err;
|
||||||
|
|
||||||
@ -278,14 +271,14 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm,
|
|||||||
/* copied function (with appropriate renaming) starts here */
|
/* copied function (with appropriate renaming) starts here */
|
||||||
|
|
||||||
static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
|
static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i, err;
|
int i, err, rank, size;
|
||||||
int size = ompi_comm_size(comm);
|
|
||||||
int rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
size = ompi_comm_size(comm);
|
||||||
|
|
||||||
/* All non-root send & receive zero-length message. */
|
/* All non-root send & receive zero-length message. */
|
||||||
|
|
||||||
if (rank > 0) {
|
if (rank > 0) {
|
||||||
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
|
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
|
||||||
MCA_COLL_BASE_TAG_BARRIER,
|
MCA_COLL_BASE_TAG_BARRIER,
|
||||||
@ -345,8 +338,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t
|
|||||||
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
|
int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int rank, size, depth;
|
int rank, size, depth, err, jump, partner;
|
||||||
int err, jump, partner;
|
|
||||||
|
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
@ -376,7 +368,7 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
depth>>=1;
|
depth >>= 1;
|
||||||
for (jump = depth; jump>0; jump>>=1) {
|
for (jump = depth; jump>0; jump>>=1) {
|
||||||
partner = rank ^ jump;
|
partner = rank ^ jump;
|
||||||
if (!(partner & (jump-1)) && partner < size) {
|
if (!(partner & (jump-1)) && partner < size) {
|
||||||
@ -423,10 +415,10 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_
|
|||||||
false, true, max_alg, NULL);
|
false, true, max_alg, NULL);
|
||||||
|
|
||||||
mca_param_indices->algorithm_param_index =
|
mca_param_indices->algorithm_param_index =
|
||||||
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"barrier_algorithm",
|
"barrier_algorithm",
|
||||||
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree",
|
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree",
|
||||||
false, false, 0, NULL);
|
false, false, 0, NULL);
|
||||||
if (mca_param_indices->algorithm_param_index < 0) {
|
if (mca_param_indices->algorithm_param_index < 0) {
|
||||||
return mca_param_indices->algorithm_param_index;
|
return mca_param_indices->algorithm_param_index;
|
||||||
}
|
}
|
||||||
|
@ -35,24 +35,20 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
|||||||
struct ompi_datatype_t* datatype,
|
struct ompi_datatype_t* datatype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t count_by_segment,
|
uint32_t count_by_segment,
|
||||||
ompi_coll_tree_t* tree )
|
ompi_coll_tree_t* tree )
|
||||||
{
|
{
|
||||||
int err = 0, line, i;
|
int err = 0, line, i, rank, size, segindex, req_index;
|
||||||
int rank, size;
|
|
||||||
int segindex;
|
|
||||||
int num_segments; /* Number of segments */
|
int num_segments; /* Number of segments */
|
||||||
int sendcount; /* number of elements sent in this segment */
|
int sendcount; /* number of elements sent in this segment */
|
||||||
size_t realsegsize;
|
size_t realsegsize, type_size;
|
||||||
char *tmpbuf;
|
char *tmpbuf;
|
||||||
size_t type_size;
|
|
||||||
ptrdiff_t extent, lb;
|
ptrdiff_t extent, lb;
|
||||||
ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
|
||||||
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
#if !defined(COLL_TUNED_BCAST_USE_BLOCKING)
|
||||||
ompi_request_t **send_reqs = NULL;
|
ompi_request_t **send_reqs = NULL;
|
||||||
#endif
|
#endif
|
||||||
int req_index;
|
|
||||||
|
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
@ -78,7 +74,7 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
|||||||
/*
|
/*
|
||||||
For each segment:
|
For each segment:
|
||||||
- send segment to all children.
|
- send segment to all children.
|
||||||
The last segment may have less elements than other segments.
|
The last segment may have less elements than other segments.
|
||||||
*/
|
*/
|
||||||
sendcount = count_by_segment;
|
sendcount = count_by_segment;
|
||||||
for( segindex = 0; segindex < num_segments; segindex++ ) {
|
for( segindex = 0; segindex < num_segments; segindex++ ) {
|
||||||
@ -120,13 +116,13 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
|||||||
Create the pipeline.
|
Create the pipeline.
|
||||||
1) Post the first receive
|
1) Post the first receive
|
||||||
2) For segments 1 .. num_segments
|
2) For segments 1 .. num_segments
|
||||||
- post new receive
|
- post new receive
|
||||||
- wait on the previous receive to complete
|
- wait on the previous receive to complete
|
||||||
- send this data to children
|
- send this data to children
|
||||||
3) Wait on the last segment
|
3) Wait on the last segment
|
||||||
4) Compute number of elements in last segment.
|
4) Compute number of elements in last segment.
|
||||||
5) Send the last segment to children
|
5) Send the last segment to children
|
||||||
*/
|
*/
|
||||||
req_index = 0;
|
req_index = 0;
|
||||||
MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
|
MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype,
|
||||||
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
|
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||||
@ -210,8 +206,8 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer,
|
|||||||
Receive all segments from parent in a loop:
|
Receive all segments from parent in a loop:
|
||||||
1) post irecv for the first segment
|
1) post irecv for the first segment
|
||||||
2) for segments 1 .. num_segments
|
2) for segments 1 .. num_segments
|
||||||
- post irecv for the next segment
|
- post irecv for the next segment
|
||||||
- wait on the previous segment to arrive
|
- wait on the previous segment to arrive
|
||||||
3) wait for the last segment
|
3) wait for the last segment
|
||||||
*/
|
*/
|
||||||
req_index = 0;
|
req_index = 0;
|
||||||
@ -259,7 +255,7 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
|||||||
struct ompi_datatype_t* datatype,
|
struct ompi_datatype_t* datatype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize )
|
uint32_t segsize )
|
||||||
{
|
{
|
||||||
int segcount = count;
|
int segcount = count;
|
||||||
@ -288,7 +284,7 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer,
|
|||||||
struct ompi_datatype_t* datatype,
|
struct ompi_datatype_t* datatype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize )
|
uint32_t segsize )
|
||||||
{
|
{
|
||||||
int segcount = count;
|
int segcount = count;
|
||||||
@ -317,7 +313,7 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer,
|
|||||||
struct ompi_datatype_t* datatype,
|
struct ompi_datatype_t* datatype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize, int32_t chains )
|
uint32_t segsize, int32_t chains )
|
||||||
{
|
{
|
||||||
int segcount = count;
|
int segcount = count;
|
||||||
@ -346,7 +342,7 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer,
|
|||||||
struct ompi_datatype_t* datatype,
|
struct ompi_datatype_t* datatype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize )
|
uint32_t segsize )
|
||||||
{
|
{
|
||||||
int segcount = count;
|
int segcount = count;
|
||||||
@ -375,19 +371,16 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
|||||||
struct ompi_datatype_t* datatype,
|
struct ompi_datatype_t* datatype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
uint32_t segsize )
|
uint32_t segsize )
|
||||||
{
|
{
|
||||||
int err=0, line;
|
int err=0, line, rank, size, segindex, i, lr, pair;
|
||||||
int rank, size;
|
|
||||||
int segindex, i, lr, pair;
|
|
||||||
int segcount[2]; /* Number of elements sent with each segment */
|
|
||||||
uint32_t counts[2];
|
uint32_t counts[2];
|
||||||
|
int segcount[2]; /* Number of elements sent with each segment */
|
||||||
int num_segments[2]; /* Number of segmenets */
|
int num_segments[2]; /* Number of segmenets */
|
||||||
int sendcount[2]; /* the same like segcount, except for the last segment */
|
int sendcount[2]; /* the same like segcount, except for the last segment */
|
||||||
size_t realsegsize[2];
|
size_t realsegsize[2], type_size;
|
||||||
char *tmpbuf[2];
|
char *tmpbuf[2];
|
||||||
size_t type_size;
|
|
||||||
ptrdiff_t type_extent, lb;
|
ptrdiff_t type_extent, lb;
|
||||||
ompi_request_t *base_req, *new_req;
|
ompi_request_t *base_req, *new_req;
|
||||||
ompi_coll_tree_t *tree;
|
ompi_coll_tree_t *tree;
|
||||||
@ -438,7 +431,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
|||||||
/* call linear version here ! */
|
/* call linear version here ! */
|
||||||
return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype,
|
return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize, 1 ));
|
segsize, 1 ));
|
||||||
}
|
}
|
||||||
|
|
||||||
err = ompi_datatype_get_extent (datatype, &lb, &type_extent);
|
err = ompi_datatype_get_extent (datatype, &lb, &type_extent);
|
||||||
@ -644,16 +637,12 @@ int
|
|||||||
ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||||
struct ompi_datatype_t *datatype, int root,
|
struct ompi_datatype_t *datatype, int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i;
|
int i, size, rank, err;
|
||||||
int size;
|
|
||||||
int rank;
|
|
||||||
int err;
|
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
ompi_request_t **preq;
|
ompi_request_t **preq, **reqs = data->mcct_reqs;
|
||||||
ompi_request_t **reqs = data->mcct_reqs;
|
|
||||||
|
|
||||||
|
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
@ -779,7 +768,7 @@ int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
@ -813,7 +802,7 @@ int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int algorithm, int faninout, int segsize)
|
int algorithm, int faninout, int segsize)
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -79,7 +79,7 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul
|
|||||||
msg_rules = (ompi_coll_msg_rule_t *) calloc (n_msg_rules, sizeof (ompi_coll_msg_rule_t));
|
msg_rules = (ompi_coll_msg_rule_t *) calloc (n_msg_rules, sizeof (ompi_coll_msg_rule_t));
|
||||||
if (!msg_rules) return (msg_rules);
|
if (!msg_rules) return (msg_rules);
|
||||||
|
|
||||||
for (i=0;i<n_msg_rules;i++) {
|
for( i = 0; i < n_msg_rules; i++ ) {
|
||||||
msg_rules[i].mpi_comsize = mpi_comsize;
|
msg_rules[i].mpi_comsize = mpi_comsize;
|
||||||
msg_rules[i].alg_rule_id = alg_rule_id;
|
msg_rules[i].alg_rule_id = alg_rule_id;
|
||||||
msg_rules[i].com_rule_id = com_rule_id;
|
msg_rules[i].com_rule_id = com_rule_id;
|
||||||
@ -98,8 +98,6 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul
|
|||||||
* Debug / IO routines
|
* Debug / IO routines
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
|
int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
|
||||||
{
|
{
|
||||||
if (!msg_p) {
|
if (!msg_p) {
|
||||||
@ -193,8 +191,6 @@ int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p, int n_rules)
|
|||||||
* Memory free routines
|
* Memory free routines
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
int ompi_coll_tuned_free_msg_rules_in_com_rule (ompi_coll_com_rule_t* com_p)
|
int ompi_coll_tuned_free_msg_rules_in_com_rule (ompi_coll_com_rule_t* com_p)
|
||||||
{
|
{
|
||||||
int rc=0;
|
int rc=0;
|
||||||
@ -224,7 +220,6 @@ int ompi_coll_tuned_free_msg_rules_in_com_rule (ompi_coll_com_rule_t* com_p)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p)
|
int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p)
|
||||||
{
|
{
|
||||||
int rc=0;
|
int rc=0;
|
||||||
@ -242,10 +237,9 @@ int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p)
|
|||||||
|
|
||||||
if (!com_p) {
|
if (!com_p) {
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL com_rules when com count was %d\n", alg_p->n_com_sizes));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL com_rules when com count was %d\n", alg_p->n_com_sizes));
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
/* ok, memory exists for the com rules so free their message rules first */
|
/* ok, memory exists for the com rules so free their message rules first */
|
||||||
for (i=0;i<alg_p->n_com_sizes;i++) {
|
for( i = 0; i < alg_p->n_com_sizes; i++ ) {
|
||||||
com_p = &(alg_p->com_rules[i]);
|
com_p = &(alg_p->com_rules[i]);
|
||||||
ompi_coll_tuned_free_msg_rules_in_com_rule (com_p);
|
ompi_coll_tuned_free_msg_rules_in_com_rule (com_p);
|
||||||
}
|
}
|
||||||
@ -265,7 +259,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
|
|||||||
int i;
|
int i;
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
|
|
||||||
for(i=0;i<n_algs;i++) {
|
for( i = 0; i < n_algs; i++ ) {
|
||||||
rc += ompi_coll_tuned_free_coms_in_alg_rule (&(alg_p[i]));
|
rc += ompi_coll_tuned_free_coms_in_alg_rule (&(alg_p[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -274,8 +268,6 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
|
|||||||
return (rc);
|
return (rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* query functions
|
* query functions
|
||||||
* i.e. the functions that get me the algorithm, topo fanin/out and segment size fast
|
* i.e. the functions that get me the algorithm, topo fanin/out and segment size fast
|
||||||
|
@ -27,47 +27,47 @@ BEGIN_C_DECLS
|
|||||||
|
|
||||||
|
|
||||||
typedef struct msg_rule_s {
|
typedef struct msg_rule_s {
|
||||||
/* paranoid / debug */
|
/* paranoid / debug */
|
||||||
int mpi_comsize; /* which MPI comm size this is is for */
|
int mpi_comsize; /* which MPI comm size this is is for */
|
||||||
|
|
||||||
/* paranoid / debug */
|
/* paranoid / debug */
|
||||||
int alg_rule_id; /* unique alg rule id */
|
int alg_rule_id; /* unique alg rule id */
|
||||||
int com_rule_id; /* unique com rule id */
|
int com_rule_id; /* unique com rule id */
|
||||||
int msg_rule_id; /* unique msg rule id */
|
int msg_rule_id; /* unique msg rule id */
|
||||||
|
|
||||||
/* RULE */
|
/* RULE */
|
||||||
size_t msg_size; /* message size */
|
size_t msg_size; /* message size */
|
||||||
|
|
||||||
/* RESULT */
|
/* RESULT */
|
||||||
int result_alg; /* result algorithm to use */
|
int result_alg; /* result algorithm to use */
|
||||||
int result_topo_faninout; /* result topology fan in/out to use (if applicable) */
|
int result_topo_faninout; /* result topology fan in/out to use (if applicable) */
|
||||||
long result_segsize; /* result segment size to use */
|
long result_segsize; /* result segment size to use */
|
||||||
int result_max_requests; /* maximum number of outstanding requests (if applicable) */
|
int result_max_requests; /* maximum number of outstanding requests (if applicable) */
|
||||||
} ompi_coll_msg_rule_t;
|
} ompi_coll_msg_rule_t;
|
||||||
|
|
||||||
|
|
||||||
typedef struct com_rule_s {
|
typedef struct com_rule_s {
|
||||||
/* paranoid / debug */
|
/* paranoid / debug */
|
||||||
int mpi_comsize; /* which MPI comm size this is is for */
|
int mpi_comsize; /* which MPI comm size this is is for */
|
||||||
|
|
||||||
/* paranoid / debug */
|
/* paranoid / debug */
|
||||||
int alg_rule_id; /* unique alg rule id */
|
int alg_rule_id; /* unique alg rule id */
|
||||||
int com_rule_id; /* unique com rule id */
|
int com_rule_id; /* unique com rule id */
|
||||||
|
|
||||||
/* RULE */
|
/* RULE */
|
||||||
int n_msg_sizes;
|
int n_msg_sizes;
|
||||||
ompi_coll_msg_rule_t *msg_rules;
|
ompi_coll_msg_rule_t *msg_rules;
|
||||||
|
|
||||||
} ompi_coll_com_rule_t;
|
} ompi_coll_com_rule_t;
|
||||||
|
|
||||||
|
|
||||||
typedef struct alg_rule_s {
|
typedef struct alg_rule_s {
|
||||||
/* paranoid / debug */
|
/* paranoid / debug */
|
||||||
int alg_rule_id; /* unique alg rule id */
|
int alg_rule_id; /* unique alg rule id */
|
||||||
|
|
||||||
/* RULE */
|
/* RULE */
|
||||||
int n_com_sizes;
|
int n_com_sizes;
|
||||||
ompi_coll_com_rule_t *com_rules;
|
ompi_coll_com_rule_t *com_rules;
|
||||||
|
|
||||||
} ompi_coll_alg_rule_t;
|
} ompi_coll_alg_rule_t;
|
||||||
|
|
||||||
|
@ -34,22 +34,15 @@
|
|||||||
* gather_intra_pipeline, segmentation? */
|
* gather_intra_pipeline, segmentation? */
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void *rbuf, int rcount,
|
void *rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1;
|
int line = -1, i, rank, vrank, size, total_recv = 0, err;
|
||||||
int i;
|
char *ptmp = NULL, *tempbuf = NULL;
|
||||||
int rank;
|
|
||||||
int vrank;
|
|
||||||
int size;
|
|
||||||
int total_recv = 0;
|
|
||||||
char *ptmp = NULL;
|
|
||||||
char *tempbuf = NULL;
|
|
||||||
int err;
|
|
||||||
ompi_coll_tree_t* bmtree;
|
ompi_coll_tree_t* bmtree;
|
||||||
MPI_Status status;
|
MPI_Status status;
|
||||||
MPI_Aint sextent, slb, strue_lb, strue_extent;
|
MPI_Aint sextent, slb, strue_lb, strue_extent;
|
||||||
@ -61,7 +54,7 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
|||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_gather_intra_binomial rank %d", rank));
|
"ompi_coll_tuned_gather_intra_binomial rank %d", rank));
|
||||||
|
|
||||||
/* create the binomial tree */
|
/* create the binomial tree */
|
||||||
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
|
COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
|
||||||
@ -75,127 +68,127 @@ ompi_coll_tuned_gather_intra_binomial(void *sbuf, int scount,
|
|||||||
if (rank == root) {
|
if (rank == root) {
|
||||||
ompi_datatype_get_extent(rdtype, &rlb, &rextent);
|
ompi_datatype_get_extent(rdtype, &rlb, &rextent);
|
||||||
ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);
|
ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);
|
||||||
if (0 == root){
|
if (0 == root){
|
||||||
/* root on 0, just use the recv buffer */
|
/* root on 0, just use the recv buffer */
|
||||||
ptmp = (char *) rbuf;
|
ptmp = (char *) rbuf;
|
||||||
if (sbuf != MPI_IN_PLACE) {
|
if (sbuf != MPI_IN_PLACE) {
|
||||||
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||||
ptmp, rcount, rdtype);
|
ptmp, rcount, rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* root is not on 0, allocate temp buffer for recv,
|
/* root is not on 0, allocate temp buffer for recv,
|
||||||
* rotate data at the end */
|
* rotate data at the end */
|
||||||
tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
|
tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
|
||||||
if (NULL == tempbuf) {
|
if (NULL == tempbuf) {
|
||||||
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptmp = tempbuf - rlb;
|
ptmp = tempbuf - rlb;
|
||||||
if (sbuf != MPI_IN_PLACE) {
|
if (sbuf != MPI_IN_PLACE) {
|
||||||
/* copy from sbuf to temp buffer */
|
/* copy from sbuf to temp buffer */
|
||||||
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||||
ptmp, rcount, rdtype);
|
ptmp, rcount, rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
} else {
|
} else {
|
||||||
/* copy from rbuf to temp buffer */
|
/* copy from rbuf to temp buffer */
|
||||||
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
|
err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp,
|
||||||
(char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
|
(char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
total_recv = rcount;
|
total_recv = rcount;
|
||||||
} else if (!(vrank % 2)) {
|
} else if (!(vrank % 2)) {
|
||||||
/* other non-leaf nodes, allocate temp buffer for data received from
|
/* other non-leaf nodes, allocate temp buffer for data received from
|
||||||
* children, the most we need is half of the total data elements due
|
* children, the most we need is half of the total data elements due
|
||||||
* to the property of binimoal tree */
|
* to the property of binimoal tree */
|
||||||
tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
|
tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
|
||||||
if (NULL == tempbuf) {
|
if (NULL == tempbuf) {
|
||||||
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptmp = tempbuf - slb;
|
ptmp = tempbuf - slb;
|
||||||
/* local copy to tempbuf */
|
/* local copy to tempbuf */
|
||||||
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||||
ptmp, scount, sdtype);
|
ptmp, scount, sdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
/* use sdtype,scount as rdtype,rdcount since they are ignored on
|
/* use sdtype,scount as rdtype,rdcount since they are ignored on
|
||||||
* non-root procs */
|
* non-root procs */
|
||||||
rdtype = sdtype;
|
rdtype = sdtype;
|
||||||
rcount = scount;
|
rcount = scount;
|
||||||
rextent = sextent;
|
rextent = sextent;
|
||||||
total_recv = rcount;
|
total_recv = rcount;
|
||||||
} else {
|
} else {
|
||||||
/* leaf nodes, no temp buffer needed, use sdtype,scount as
|
/* leaf nodes, no temp buffer needed, use sdtype,scount as
|
||||||
* rdtype,rdcount since they are ignored on non-root procs */
|
* rdtype,rdcount since they are ignored on non-root procs */
|
||||||
ptmp = (char *) sbuf;
|
ptmp = (char *) sbuf;
|
||||||
total_recv = scount;
|
total_recv = scount;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!(vrank % 2)) {
|
if (!(vrank % 2)) {
|
||||||
/* all non-leaf nodes recv from children */
|
/* all non-leaf nodes recv from children */
|
||||||
for (i = 0; i < bmtree->tree_nextsize; i++) {
|
for (i = 0; i < bmtree->tree_nextsize; i++) {
|
||||||
int mycount = 0, vkid;
|
int mycount = 0, vkid;
|
||||||
/* figure out how much data I have to send to this child */
|
/* figure out how much data I have to send to this child */
|
||||||
vkid = (bmtree->tree_next[i] - root + size) % size;
|
vkid = (bmtree->tree_next[i] - root + size) % size;
|
||||||
mycount = vkid - vrank;
|
mycount = vkid - vrank;
|
||||||
if (mycount > (size - vkid))
|
if (mycount > (size - vkid))
|
||||||
mycount = size - vkid;
|
mycount = size - vkid;
|
||||||
mycount *= rcount;
|
mycount *= rcount;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d",
|
"ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d",
|
||||||
rank, bmtree->tree_next[i], mycount));
|
rank, bmtree->tree_next[i], mycount));
|
||||||
|
|
||||||
err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
|
err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype,
|
||||||
bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER,
|
bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER,
|
||||||
comm, &status));
|
comm, &status));
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
total_recv += mycount;
|
total_recv += mycount;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rank != root) {
|
if (rank != root) {
|
||||||
/* all nodes except root send to parents */
|
/* all nodes except root send to parents */
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n",
|
"ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n",
|
||||||
rank, bmtree->tree_prev, total_recv));
|
rank, bmtree->tree_prev, total_recv));
|
||||||
|
|
||||||
err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
|
err = MCA_PML_CALL(send(ptmp, total_recv, sdtype,
|
||||||
bmtree->tree_prev,
|
bmtree->tree_prev,
|
||||||
MCA_COLL_BASE_TAG_GATHER,
|
MCA_COLL_BASE_TAG_GATHER,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rank == root) {
|
if (rank == root) {
|
||||||
if (root != 0) {
|
if (root != 0) {
|
||||||
/* rotate received data on root if root != 0 */
|
/* rotate received data on root if root != 0 */
|
||||||
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root),
|
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root),
|
||||||
(char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp);
|
(char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
|
|
||||||
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root,
|
err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root,
|
||||||
(char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
|
(char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
free(tempbuf);
|
free(tempbuf);
|
||||||
}
|
}
|
||||||
} else if (!(vrank % 2)) {
|
} else if (!(vrank % 2)) {
|
||||||
/* other non-leaf nodes */
|
/* other non-leaf nodes */
|
||||||
free(tempbuf);
|
free(tempbuf);
|
||||||
}
|
}
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
|
|
||||||
err_hndl:
|
err_hndl:
|
||||||
if (NULL != tempbuf)
|
if (NULL != tempbuf)
|
||||||
free(tempbuf);
|
free(tempbuf);
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||||
__FILE__, line, err, rank));
|
__FILE__, line, err, rank));
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -213,23 +206,18 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
|||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int first_segment_size)
|
int first_segment_size)
|
||||||
{
|
{
|
||||||
int i;
|
int i, ret, line, rank, size, first_segment_count;
|
||||||
int ret, line;
|
MPI_Aint extent, lb;
|
||||||
int rank, size;
|
|
||||||
int first_segment_count;
|
|
||||||
size_t typelng;
|
size_t typelng;
|
||||||
MPI_Aint extent;
|
|
||||||
MPI_Aint lb;
|
|
||||||
|
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
|
"ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size));
|
||||||
|
|
||||||
if (rank != root) {
|
if (rank != root) {
|
||||||
/* Non-root processes:
|
/* Non-root processes:
|
||||||
@ -259,18 +247,18 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
|||||||
root, MCA_COLL_BASE_TAG_GATHER,
|
root, MCA_COLL_BASE_TAG_GATHER,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
|
||||||
|
|
||||||
else {
|
} else {
|
||||||
|
|
||||||
/* Root process,
|
/* Root process,
|
||||||
- For every non-root node:
|
- For every non-root node:
|
||||||
- post irecv for the first segment of the message
|
- post irecv for the first segment of the message
|
||||||
- send zero byte message to signal node to send the message
|
- send zero byte message to signal node to send the message
|
||||||
- post irecv for the second segment of the message
|
- post irecv for the second segment of the message
|
||||||
- wait for the first segment to complete
|
- wait for the first segment to complete
|
||||||
- Copy local data if necessary
|
- Copy local data if necessary
|
||||||
- Waitall for all the second segments to complete.
|
- Waitall for all the second segments to complete.
|
||||||
*/
|
*/
|
||||||
char *ptmp;
|
char *ptmp;
|
||||||
ompi_request_t **reqs = NULL, *first_segment_req;
|
ompi_request_t **reqs = NULL, *first_segment_req;
|
||||||
reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*));
|
reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*));
|
||||||
@ -318,8 +306,8 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
|||||||
/* copy local data if necessary */
|
/* copy local data if necessary */
|
||||||
if (MPI_IN_PLACE != sbuf) {
|
if (MPI_IN_PLACE != sbuf) {
|
||||||
ret = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
ret = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||||
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
|
(char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent,
|
||||||
rcount, rdtype);
|
rcount, rdtype);
|
||||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -362,28 +350,23 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount,
|
|||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
|
ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void *rbuf, int rcount,
|
void *rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i;
|
int i, err, rank, size;
|
||||||
int err;
|
|
||||||
int rank;
|
|
||||||
int size;
|
|
||||||
char *ptmp;
|
char *ptmp;
|
||||||
MPI_Aint incr;
|
MPI_Aint incr, extent, lb;
|
||||||
MPI_Aint extent;
|
|
||||||
MPI_Aint lb;
|
|
||||||
|
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
/* Everyone but root sends data and returns. */
|
/* Everyone but root sends data and returns. */
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"ompi_coll_tuned_gather_intra_basic_linear rank %d", rank));
|
"ompi_coll_tuned_gather_intra_basic_linear rank %d", rank));
|
||||||
|
|
||||||
if (rank != root) {
|
if (rank != root) {
|
||||||
return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
|
return MCA_PML_CALL(send(sbuf, scount, sdtype, root,
|
||||||
@ -399,7 +382,7 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount,
|
|||||||
if (i == rank) {
|
if (i == rank) {
|
||||||
if (MPI_IN_PLACE != sbuf) {
|
if (MPI_IN_PLACE != sbuf) {
|
||||||
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||||
ptmp, rcount, rdtype);
|
ptmp, rcount, rdtype);
|
||||||
} else {
|
} else {
|
||||||
err = MPI_SUCCESS;
|
err = MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -489,88 +472,85 @@ ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_pa
|
|||||||
|
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
|
ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:gather_intra_do_forced selected algorithm %d",
|
"coll:tuned:gather_intra_do_forced selected algorithm %d",
|
||||||
data->user_forced[GATHER].algorithm));
|
data->user_forced[GATHER].algorithm));
|
||||||
|
|
||||||
switch (data->user_forced[GATHER].algorithm) {
|
switch (data->user_forced[GATHER].algorithm) {
|
||||||
case (0):
|
case (0):
|
||||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (1):
|
case (1):
|
||||||
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (2):
|
case (2):
|
||||||
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (3):
|
case (3):
|
||||||
{
|
|
||||||
const int first_segment_size = data->user_forced[GATHER].segsize;
|
|
||||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
first_segment_size);
|
data->user_forced[GATHER].segsize);
|
||||||
}
|
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
"coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
data->user_forced[GATHER].algorithm,
|
data->user_forced[GATHER].algorithm,
|
||||||
ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
||||||
return (MPI_ERR_ARG);
|
return (MPI_ERR_ARG);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount,
|
ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int algorithm, int faninout, int segsize)
|
int algorithm, int faninout, int segsize)
|
||||||
{
|
{
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
"coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||||
algorithm, faninout, segsize));
|
algorithm, faninout, segsize));
|
||||||
|
|
||||||
switch (algorithm) {
|
switch (algorithm) {
|
||||||
case (0):
|
case (0):
|
||||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (1):
|
case (1):
|
||||||
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (2):
|
case (2):
|
||||||
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (3):
|
case (3):
|
||||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module,
|
root, comm, module,
|
||||||
segsize);
|
segsize);
|
||||||
|
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
"coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
algorithm,
|
algorithm,
|
||||||
ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
ompi_coll_tuned_forced_max_algorithms[GATHER]));
|
||||||
return (MPI_ERR_ARG);
|
return (MPI_ERR_ARG);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
}
|
}
|
||||||
|
@ -42,8 +42,6 @@ static int tuned_module_enable(mca_coll_base_module_t *module,
|
|||||||
int ompi_coll_tuned_init_query(bool enable_progress_threads,
|
int ompi_coll_tuned_init_query(bool enable_progress_threads,
|
||||||
bool enable_mpi_threads)
|
bool enable_mpi_threads)
|
||||||
{
|
{
|
||||||
/* Nothing to do */
|
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -483,9 +483,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf,
|
|||||||
uint32_t segsize,
|
uint32_t segsize,
|
||||||
int max_outstanding_reqs )
|
int max_outstanding_reqs )
|
||||||
{
|
{
|
||||||
int ret;
|
int ret, rank, size, io_root, segcount = count;
|
||||||
int rank, size, io_root;
|
|
||||||
int segcount = count;
|
|
||||||
void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
|
void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
|
||||||
size_t typelng;
|
size_t typelng;
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
@ -603,10 +601,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
|||||||
{
|
{
|
||||||
int i, rank, err, size;
|
int i, rank, err, size;
|
||||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||||
char *free_buffer = NULL;
|
char *free_buffer = NULL, *pml_buffer = NULL;
|
||||||
char *pml_buffer = NULL;
|
char *inplace_temp = NULL, *inbuf;
|
||||||
char *inplace_temp = NULL;
|
|
||||||
char *inbuf;
|
|
||||||
|
|
||||||
/* Initialize */
|
/* Initialize */
|
||||||
|
|
||||||
|
@ -43,16 +43,11 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int err, i;
|
int err, i, rank, size, total_count, *displs = NULL;
|
||||||
int rank, size;
|
|
||||||
int total_count;
|
|
||||||
int *displs = NULL;
|
|
||||||
char *tmprbuf = NULL;
|
|
||||||
char *tmprbuf_free = NULL;
|
|
||||||
|
|
||||||
const int root = 0;
|
const int root = 0;
|
||||||
|
char *tmprbuf = NULL, *tmprbuf_free = NULL;
|
||||||
|
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
@ -64,42 +59,42 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf,
|
|||||||
/* Reduce to rank 0 (root) and scatterv */
|
/* Reduce to rank 0 (root) and scatterv */
|
||||||
tmprbuf = (char*) rbuf;
|
tmprbuf = (char*) rbuf;
|
||||||
if (MPI_IN_PLACE == sbuf) {
|
if (MPI_IN_PLACE == sbuf) {
|
||||||
/* rbuf on root (0) is big enough to hold whole data */
|
/* rbuf on root (0) is big enough to hold whole data */
|
||||||
if (root == rank) {
|
if (root == rank) {
|
||||||
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
|
err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count,
|
||||||
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
||||||
} else {
|
} else {
|
||||||
err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count,
|
err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count,
|
||||||
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (root == rank) {
|
if (root == rank) {
|
||||||
/* We must allocate temporary receive buffer on root to ensure that
|
/* We must allocate temporary receive buffer on root to ensure that
|
||||||
rbuf is big enough */
|
rbuf is big enough */
|
||||||
ptrdiff_t lb, extent, tlb, textent;
|
ptrdiff_t lb, extent, tlb, textent;
|
||||||
|
|
||||||
ompi_datatype_get_extent(dtype, &lb, &extent);
|
ompi_datatype_get_extent(dtype, &lb, &extent);
|
||||||
ompi_datatype_get_true_extent(dtype, &tlb, &textent);
|
ompi_datatype_get_true_extent(dtype, &tlb, &textent);
|
||||||
|
|
||||||
tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent);
|
tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent);
|
||||||
tmprbuf = tmprbuf_free - lb;
|
tmprbuf = tmprbuf_free - lb;
|
||||||
}
|
}
|
||||||
err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count,
|
err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count,
|
||||||
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
dtype, op, root, comm, comm->c_coll.coll_reduce_module);
|
||||||
}
|
}
|
||||||
if (MPI_SUCCESS != err) {
|
if (MPI_SUCCESS != err) {
|
||||||
if (NULL != tmprbuf_free) free(tmprbuf_free);
|
if (NULL != tmprbuf_free) free(tmprbuf_free);
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
displs = (int*) malloc(size * sizeof(int));
|
displs = (int*) malloc(size * sizeof(int));
|
||||||
displs[0] = 0;
|
displs[0] = 0;
|
||||||
for (i = 1; i < size; i++) {
|
for (i = 1; i < size; i++) {
|
||||||
displs[i] = displs[i-1] + rcounts[i-1];
|
displs[i] = displs[i-1] + rcounts[i-1];
|
||||||
}
|
}
|
||||||
err = comm->c_coll.coll_scatterv (tmprbuf, rcounts, displs, dtype,
|
err = comm->c_coll.coll_scatterv (tmprbuf, rcounts, displs, dtype,
|
||||||
rbuf, rcounts[rank], dtype,
|
rbuf, rcounts[rank], dtype,
|
||||||
root, comm, comm->c_coll.coll_scatterv_module);
|
root, comm, comm->c_coll.coll_scatterv_module);
|
||||||
free(displs);
|
free(displs);
|
||||||
if (NULL != tmprbuf_free) free(tmprbuf_free);
|
if (NULL != tmprbuf_free) free(tmprbuf_free);
|
||||||
|
|
||||||
@ -130,11 +125,10 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i, rank, size, count, err = OMPI_SUCCESS;
|
int i, rank, size, count, err = OMPI_SUCCESS;
|
||||||
int tmp_size, remain = 0, tmp_rank;
|
int tmp_size, remain = 0, tmp_rank, *disps = NULL;
|
||||||
int *disps = NULL;
|
|
||||||
ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
|
ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
|
||||||
char *recv_buf = NULL, *recv_buf_free = NULL;
|
char *recv_buf = NULL, *recv_buf_free = NULL;
|
||||||
char *result_buf = NULL, *result_buf_free = NULL;
|
char *result_buf = NULL, *result_buf_free = NULL;
|
||||||
@ -151,14 +145,14 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
|||||||
|
|
||||||
disps[0] = 0;
|
disps[0] = 0;
|
||||||
for (i = 0; i < (size - 1); ++i) {
|
for (i = 0; i < (size - 1); ++i) {
|
||||||
disps[i + 1] = disps[i] + rcounts[i];
|
disps[i + 1] = disps[i] + rcounts[i];
|
||||||
}
|
}
|
||||||
count = disps[size - 1] + rcounts[size - 1];
|
count = disps[size - 1] + rcounts[size - 1];
|
||||||
|
|
||||||
/* short cut the trivial case */
|
/* short cut the trivial case */
|
||||||
if (0 == count) {
|
if (0 == count) {
|
||||||
free(disps);
|
free(disps);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get datatype information */
|
/* get datatype information */
|
||||||
@ -168,15 +162,15 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
|||||||
|
|
||||||
/* Handle MPI_IN_PLACE */
|
/* Handle MPI_IN_PLACE */
|
||||||
if (MPI_IN_PLACE == sbuf) {
|
if (MPI_IN_PLACE == sbuf) {
|
||||||
sbuf = rbuf;
|
sbuf = rbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Allocate temporary receive buffer. */
|
/* Allocate temporary receive buffer. */
|
||||||
recv_buf_free = (char*) malloc(buf_size);
|
recv_buf_free = (char*) malloc(buf_size);
|
||||||
recv_buf = recv_buf_free - lb;
|
recv_buf = recv_buf_free - lb;
|
||||||
if (NULL == recv_buf_free) {
|
if (NULL == recv_buf_free) {
|
||||||
err = OMPI_ERR_OUT_OF_RESOURCE;
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* allocate temporary buffer for results */
|
/* allocate temporary buffer for results */
|
||||||
@ -198,186 +192,186 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf,
|
|||||||
procs with an even rank send to rank + 1, leaving a power of
|
procs with an even rank send to rank + 1, leaving a power of
|
||||||
two procs to do the rest of the algorithm */
|
two procs to do the rest of the algorithm */
|
||||||
if (rank < 2 * remain) {
|
if (rank < 2 * remain) {
|
||||||
if ((rank & 1) == 0) {
|
if ((rank & 1) == 0) {
|
||||||
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
|
err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
MCA_PML_BASE_SEND_STANDARD,
|
MCA_PML_BASE_SEND_STANDARD,
|
||||||
comm));
|
comm));
|
||||||
if (OMPI_SUCCESS != err) goto cleanup;
|
if (OMPI_SUCCESS != err) goto cleanup;
|
||||||
|
|
||||||
/* we don't participate from here on out */
|
/* we don't participate from here on out */
|
||||||
tmp_rank = -1;
|
tmp_rank = -1;
|
||||||
} else {
|
} else {
|
||||||
err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
|
err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
comm, MPI_STATUS_IGNORE));
|
comm, MPI_STATUS_IGNORE));
|
||||||
|
|
||||||
/* integrate their results into our temp results */
|
/* integrate their results into our temp results */
|
||||||
ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
|
ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
|
||||||
|
|
||||||
/* adjust rank to be the bottom "remain" ranks */
|
/* adjust rank to be the bottom "remain" ranks */
|
||||||
tmp_rank = rank / 2;
|
tmp_rank = rank / 2;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* just need to adjust rank to show that the bottom "even
|
/* just need to adjust rank to show that the bottom "even
|
||||||
remain" ranks dropped out */
|
remain" ranks dropped out */
|
||||||
tmp_rank = rank - remain;
|
tmp_rank = rank - remain;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* For ranks not kicked out by the above code, perform the
|
/* For ranks not kicked out by the above code, perform the
|
||||||
recursive halving */
|
recursive halving */
|
||||||
if (tmp_rank >= 0) {
|
if (tmp_rank >= 0) {
|
||||||
int *tmp_disps = NULL, *tmp_rcounts = NULL;
|
int *tmp_disps = NULL, *tmp_rcounts = NULL;
|
||||||
int mask, send_index, recv_index, last_index;
|
int mask, send_index, recv_index, last_index;
|
||||||
|
|
||||||
/* recalculate disps and rcounts to account for the
|
/* recalculate disps and rcounts to account for the
|
||||||
special "remainder" processes that are no longer doing
|
special "remainder" processes that are no longer doing
|
||||||
anything */
|
anything */
|
||||||
tmp_rcounts = (int*) malloc(tmp_size * sizeof(int));
|
tmp_rcounts = (int*) malloc(tmp_size * sizeof(int));
|
||||||
if (NULL == tmp_rcounts) {
|
if (NULL == tmp_rcounts) {
|
||||||
err = OMPI_ERR_OUT_OF_RESOURCE;
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
tmp_disps = (int*) malloc(tmp_size * sizeof(int));
|
tmp_disps = (int*) malloc(tmp_size * sizeof(int));
|
||||||
if (NULL == tmp_disps) {
|
if (NULL == tmp_disps) {
|
||||||
free(tmp_rcounts);
|
free(tmp_rcounts);
|
||||||
err = OMPI_ERR_OUT_OF_RESOURCE;
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0 ; i < tmp_size ; ++i) {
|
for (i = 0 ; i < tmp_size ; ++i) {
|
||||||
if (i < remain) {
|
if (i < remain) {
|
||||||
/* need to include old neighbor as well */
|
/* need to include old neighbor as well */
|
||||||
tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
|
tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
|
||||||
} else {
|
} else {
|
||||||
tmp_rcounts[i] = rcounts[i + remain];
|
tmp_rcounts[i] = rcounts[i + remain];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp_disps[0] = 0;
|
tmp_disps[0] = 0;
|
||||||
for (i = 0; i < tmp_size - 1; ++i) {
|
for (i = 0; i < tmp_size - 1; ++i) {
|
||||||
tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
|
tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
/* do the recursive halving communication. Don't use the
|
/* do the recursive halving communication. Don't use the
|
||||||
dimension information on the communicator because I
|
dimension information on the communicator because I
|
||||||
think the information is invalidated by our "shrinking"
|
think the information is invalidated by our "shrinking"
|
||||||
of the communicator */
|
of the communicator */
|
||||||
mask = tmp_size >> 1;
|
mask = tmp_size >> 1;
|
||||||
send_index = recv_index = 0;
|
send_index = recv_index = 0;
|
||||||
last_index = tmp_size;
|
last_index = tmp_size;
|
||||||
while (mask > 0) {
|
while (mask > 0) {
|
||||||
int tmp_peer, peer, send_count, recv_count;
|
int tmp_peer, peer, send_count, recv_count;
|
||||||
struct ompi_request_t *request;
|
struct ompi_request_t *request;
|
||||||
|
|
||||||
tmp_peer = tmp_rank ^ mask;
|
tmp_peer = tmp_rank ^ mask;
|
||||||
peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;
|
peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;
|
||||||
|
|
||||||
/* figure out if we're sending, receiving, or both */
|
/* figure out if we're sending, receiving, or both */
|
||||||
send_count = recv_count = 0;
|
send_count = recv_count = 0;
|
||||||
if (tmp_rank < tmp_peer) {
|
if (tmp_rank < tmp_peer) {
|
||||||
send_index = recv_index + mask;
|
send_index = recv_index + mask;
|
||||||
for (i = send_index ; i < last_index ; ++i) {
|
for (i = send_index ; i < last_index ; ++i) {
|
||||||
send_count += tmp_rcounts[i];
|
send_count += tmp_rcounts[i];
|
||||||
}
|
}
|
||||||
for (i = recv_index ; i < send_index ; ++i) {
|
for (i = recv_index ; i < send_index ; ++i) {
|
||||||
recv_count += tmp_rcounts[i];
|
recv_count += tmp_rcounts[i];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
recv_index = send_index + mask;
|
recv_index = send_index + mask;
|
||||||
for (i = send_index ; i < recv_index ; ++i) {
|
for (i = send_index ; i < recv_index ; ++i) {
|
||||||
send_count += tmp_rcounts[i];
|
send_count += tmp_rcounts[i];
|
||||||
}
|
}
|
||||||
for (i = recv_index ; i < last_index ; ++i) {
|
for (i = recv_index ; i < last_index ; ++i) {
|
||||||
recv_count += tmp_rcounts[i];
|
recv_count += tmp_rcounts[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* actual data transfer. Send from result_buf,
|
/* actual data transfer. Send from result_buf,
|
||||||
receive into recv_buf */
|
receive into recv_buf */
|
||||||
if (send_count > 0 && recv_count != 0) {
|
if (send_count > 0 && recv_count != 0) {
|
||||||
err = MCA_PML_CALL(irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
err = MCA_PML_CALL(irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
||||||
recv_count, dtype, peer,
|
recv_count, dtype, peer,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
comm, &request));
|
comm, &request));
|
||||||
if (OMPI_SUCCESS != err) {
|
if (OMPI_SUCCESS != err) {
|
||||||
free(tmp_rcounts);
|
free(tmp_rcounts);
|
||||||
free(tmp_disps);
|
free(tmp_disps);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (recv_count > 0 && send_count != 0) {
|
if (recv_count > 0 && send_count != 0) {
|
||||||
err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
|
err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
|
||||||
send_count, dtype, peer,
|
send_count, dtype, peer,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
MCA_PML_BASE_SEND_STANDARD,
|
MCA_PML_BASE_SEND_STANDARD,
|
||||||
comm));
|
comm));
|
||||||
if (OMPI_SUCCESS != err) {
|
if (OMPI_SUCCESS != err) {
|
||||||
free(tmp_rcounts);
|
free(tmp_rcounts);
|
||||||
free(tmp_disps);
|
free(tmp_disps);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (send_count > 0 && recv_count != 0) {
|
if (send_count > 0 && recv_count != 0) {
|
||||||
err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
|
err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
|
||||||
if (OMPI_SUCCESS != err) {
|
if (OMPI_SUCCESS != err) {
|
||||||
free(tmp_rcounts);
|
free(tmp_rcounts);
|
||||||
free(tmp_disps);
|
free(tmp_disps);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if we received something on this step, push it into
|
/* if we received something on this step, push it into
|
||||||
the results buffer */
|
the results buffer */
|
||||||
if (recv_count > 0) {
|
if (recv_count > 0) {
|
||||||
ompi_op_reduce(op,
|
ompi_op_reduce(op,
|
||||||
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
||||||
result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
|
||||||
recv_count, dtype);
|
recv_count, dtype);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update for next iteration */
|
/* update for next iteration */
|
||||||
send_index = recv_index;
|
send_index = recv_index;
|
||||||
last_index = recv_index + mask;
|
last_index = recv_index + mask;
|
||||||
mask >>= 1;
|
mask >>= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* copy local results from results buffer into real receive buffer */
|
/* copy local results from results buffer into real receive buffer */
|
||||||
if (0 != rcounts[rank]) {
|
if (0 != rcounts[rank]) {
|
||||||
err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
|
err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent,
|
||||||
rcounts[rank], dtype,
|
rcounts[rank], dtype,
|
||||||
rbuf, rcounts[rank], dtype);
|
rbuf, rcounts[rank], dtype);
|
||||||
if (OMPI_SUCCESS != err) {
|
if (OMPI_SUCCESS != err) {
|
||||||
free(tmp_rcounts);
|
free(tmp_rcounts);
|
||||||
free(tmp_disps);
|
free(tmp_disps);
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
free(tmp_rcounts);
|
free(tmp_rcounts);
|
||||||
free(tmp_disps);
|
free(tmp_disps);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Now fix up the non-power of two case, by having the odd
|
/* Now fix up the non-power of two case, by having the odd
|
||||||
procs send the even procs the proper results */
|
procs send the even procs the proper results */
|
||||||
if (rank < (2 * remain)) {
|
if (rank < (2 * remain)) {
|
||||||
if ((rank & 1) == 0) {
|
if ((rank & 1) == 0) {
|
||||||
if (rcounts[rank]) {
|
if (rcounts[rank]) {
|
||||||
err = MCA_PML_CALL(recv(rbuf, rcounts[rank], dtype, rank + 1,
|
err = MCA_PML_CALL(recv(rbuf, rcounts[rank], dtype, rank + 1,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
comm, MPI_STATUS_IGNORE));
|
comm, MPI_STATUS_IGNORE));
|
||||||
if (OMPI_SUCCESS != err) goto cleanup;
|
if (OMPI_SUCCESS != err) goto cleanup;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (rcounts[rank - 1]) {
|
if (rcounts[rank - 1]) {
|
||||||
err = MCA_PML_CALL(send(result_buf + disps[rank - 1] * extent,
|
err = MCA_PML_CALL(send(result_buf + disps[rank - 1] * extent,
|
||||||
rcounts[rank - 1], dtype, rank - 1,
|
rcounts[rank - 1], dtype, rank - 1,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
MCA_PML_BASE_SEND_STANDARD,
|
MCA_PML_BASE_SEND_STANDARD,
|
||||||
comm));
|
comm));
|
||||||
if (OMPI_SUCCESS != err) goto cleanup;
|
if (OMPI_SUCCESS != err) goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
@ -455,27 +449,22 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int ret, line;
|
int ret, line, rank, size, i, k, recv_from, send_to, total_count, max_block_count;
|
||||||
int rank, size, i, k, recv_from, send_to;
|
int inbi, *displs = NULL;
|
||||||
int total_count, max_block_count;
|
char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL;
|
||||||
int inbi;
|
char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL};
|
||||||
int *displs = NULL;
|
|
||||||
size_t typelng;
|
|
||||||
char *tmpsend = NULL, *tmprecv = NULL;
|
|
||||||
char *inbuf_free[2] = {NULL, NULL};
|
|
||||||
char *inbuf[2] = {NULL, NULL};
|
|
||||||
char *accumbuf = NULL, *accumbuf_free = NULL;
|
|
||||||
ptrdiff_t true_lb, true_extent, lb, extent, max_real_segsize;
|
ptrdiff_t true_lb, true_extent, lb, extent, max_real_segsize;
|
||||||
ompi_request_t *reqs[2] = {NULL, NULL};
|
ompi_request_t *reqs[2] = {NULL, NULL};
|
||||||
|
size_t typelng;
|
||||||
|
|
||||||
size = ompi_comm_size(comm);
|
size = ompi_comm_size(comm);
|
||||||
rank = ompi_comm_rank(comm);
|
rank = ompi_comm_rank(comm);
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:reduce_scatter_intra_ring rank %d, size %d",
|
"coll:tuned:reduce_scatter_intra_ring rank %d, size %d",
|
||||||
rank, size));
|
rank, size));
|
||||||
|
|
||||||
/* Determine the maximum number of elements per node,
|
/* Determine the maximum number of elements per node,
|
||||||
corresponding block size, and displacements array.
|
corresponding block size, and displacements array.
|
||||||
@ -486,20 +475,20 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
|||||||
total_count = rcounts[0];
|
total_count = rcounts[0];
|
||||||
max_block_count = rcounts[0];
|
max_block_count = rcounts[0];
|
||||||
for (i = 1; i < size; i++) {
|
for (i = 1; i < size; i++) {
|
||||||
displs[i] = total_count;
|
displs[i] = total_count;
|
||||||
total_count += rcounts[i];
|
total_count += rcounts[i];
|
||||||
if (max_block_count < rcounts[i]) max_block_count = rcounts[i];
|
if (max_block_count < rcounts[i]) max_block_count = rcounts[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Special case for size == 1 */
|
/* Special case for size == 1 */
|
||||||
if (1 == size) {
|
if (1 == size) {
|
||||||
if (MPI_IN_PLACE != sbuf) {
|
if (MPI_IN_PLACE != sbuf) {
|
||||||
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
||||||
(char*)rbuf, (char*)sbuf);
|
(char*)rbuf, (char*)sbuf);
|
||||||
if (ret < 0) { line = __LINE__; goto error_hndl; }
|
if (ret < 0) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
}
|
||||||
free(displs);
|
free(displs);
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Allocate and initialize temporary buffers, we need:
|
/* Allocate and initialize temporary buffers, we need:
|
||||||
@ -524,9 +513,9 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
|||||||
if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; }
|
if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; }
|
||||||
inbuf[0] = inbuf_free[0] - lb;
|
inbuf[0] = inbuf_free[0] - lb;
|
||||||
if (size > 2) {
|
if (size > 2) {
|
||||||
inbuf_free[1] = (char*)malloc(max_real_segsize);
|
inbuf_free[1] = (char*)malloc(max_real_segsize);
|
||||||
if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; }
|
if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; }
|
||||||
inbuf[1] = inbuf_free[1] - lb;
|
inbuf[1] = inbuf_free[1] - lb;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Handle MPI_IN_PLACE for size > 1 */
|
/* Handle MPI_IN_PLACE for size > 1 */
|
||||||
@ -535,7 +524,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
|||||||
}
|
}
|
||||||
|
|
||||||
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
ret = ompi_datatype_copy_content_same_ddt(dtype, total_count,
|
||||||
accumbuf, (char*)sbuf);
|
accumbuf, (char*)sbuf);
|
||||||
if (ret < 0) { line = __LINE__; goto error_hndl; }
|
if (ret < 0) { line = __LINE__; goto error_hndl; }
|
||||||
|
|
||||||
/* Computation loop */
|
/* Computation loop */
|
||||||
@ -561,41 +550,41 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
|||||||
inbi = 0;
|
inbi = 0;
|
||||||
/* Initialize first receive from the neighbor on the left */
|
/* Initialize first receive from the neighbor on the left */
|
||||||
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
|
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
||||||
&reqs[inbi]));
|
&reqs[inbi]));
|
||||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||||
tmpsend = accumbuf + (ptrdiff_t)displs[recv_from] * extent;
|
tmpsend = accumbuf + (ptrdiff_t)displs[recv_from] * extent;
|
||||||
ret = MCA_PML_CALL(send(tmpsend, rcounts[recv_from], dtype, send_to,
|
ret = MCA_PML_CALL(send(tmpsend, rcounts[recv_from], dtype, send_to,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||||
|
|
||||||
for (k = 2; k < size; k++) {
|
for (k = 2; k < size; k++) {
|
||||||
const int prevblock = (rank + size - k) % size;
|
const int prevblock = (rank + size - k) % size;
|
||||||
|
|
||||||
inbi = inbi ^ 0x1;
|
inbi = inbi ^ 0x1;
|
||||||
|
|
||||||
/* Post irecv for the current block */
|
/* Post irecv for the current block */
|
||||||
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
|
ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm,
|
||||||
&reqs[inbi]));
|
&reqs[inbi]));
|
||||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||||
|
|
||||||
/* Wait on previous block to arrive */
|
/* Wait on previous block to arrive */
|
||||||
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
|
ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
|
||||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||||
|
|
||||||
/* Apply operation on previous block: result goes to rbuf
|
/* Apply operation on previous block: result goes to rbuf
|
||||||
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
|
rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
|
||||||
*/
|
*/
|
||||||
tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent;
|
tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent;
|
||||||
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype);
|
ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype);
|
||||||
|
|
||||||
/* send previous block to send_to */
|
/* send previous block to send_to */
|
||||||
ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to,
|
ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to,
|
||||||
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
MCA_COLL_BASE_TAG_REDUCE_SCATTER,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Wait on the last block to arrive */
|
/* Wait on the last block to arrive */
|
||||||
@ -620,7 +609,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts,
|
|||||||
|
|
||||||
error_hndl:
|
error_hndl:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n",
|
||||||
__FILE__, line, rank, ret));
|
__FILE__, line, rank, ret));
|
||||||
if (NULL != displs) free(displs);
|
if (NULL != displs) free(displs);
|
||||||
if (NULL != accumbuf_free) free(accumbuf_free);
|
if (NULL != accumbuf_free) free(accumbuf_free);
|
||||||
if (NULL != inbuf_free[0]) free(inbuf_free[0]);
|
if (NULL != inbuf_free[0]) free(inbuf_free[0]);
|
||||||
@ -652,10 +641,10 @@ int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_alg
|
|||||||
false, true, max_alg, NULL);
|
false, true, max_alg, NULL);
|
||||||
|
|
||||||
mca_param_indices->algorithm_param_index
|
mca_param_indices->algorithm_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"reduce_scatter_algorithm",
|
"reduce_scatter_algorithm",
|
||||||
"Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring",
|
"Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring",
|
||||||
false, false, 0, NULL);
|
false, false, 0, NULL);
|
||||||
if (mca_param_indices->algorithm_param_index < 0) {
|
if (mca_param_indices->algorithm_param_index < 0) {
|
||||||
return mca_param_indices->algorithm_param_index;
|
return mca_param_indices->algorithm_param_index;
|
||||||
}
|
}
|
||||||
@ -695,30 +684,30 @@ int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_alg
|
|||||||
|
|
||||||
int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf,
|
int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf,
|
||||||
int *rcounts,
|
int *rcounts,
|
||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d",
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d",
|
||||||
data->user_forced[REDUCESCATTER].algorithm));
|
data->user_forced[REDUCESCATTER].algorithm));
|
||||||
|
|
||||||
switch (data->user_forced[REDUCESCATTER].algorithm) {
|
switch (data->user_forced[REDUCESCATTER].algorithm) {
|
||||||
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
|
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
||||||
return (MPI_ERR_ARG);
|
return (MPI_ERR_ARG);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -728,25 +717,25 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf,
|
|||||||
struct ompi_datatype_t *dtype,
|
struct ompi_datatype_t *dtype,
|
||||||
struct ompi_op_t *op,
|
struct ompi_op_t *op,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int algorithm, int faninout, int segsize)
|
int algorithm, int faninout, int segsize)
|
||||||
{
|
{
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||||
algorithm, faninout, segsize));
|
algorithm, faninout, segsize));
|
||||||
|
|
||||||
switch (algorithm) {
|
switch (algorithm) {
|
||||||
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
|
case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts,
|
||||||
dtype, op, comm, module);
|
dtype, op, comm, module);
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER]));
|
||||||
return (MPI_ERR_ARG);
|
return (MPI_ERR_ARG);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,22 +32,15 @@
|
|||||||
|
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
|
ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void *rbuf, int rcount,
|
void *rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int line = -1;
|
int line = -1, i, rank, vrank, size, total_send = 0, err;
|
||||||
int i;
|
char *ptmp, *tempbuf = NULL;
|
||||||
int rank;
|
|
||||||
int vrank;
|
|
||||||
int size;
|
|
||||||
int total_send = 0;
|
|
||||||
char *ptmp = NULL;
|
|
||||||
char *tempbuf = NULL;
|
|
||||||
int err;
|
|
||||||
ompi_coll_tree_t* bmtree;
|
ompi_coll_tree_t* bmtree;
|
||||||
MPI_Status status;
|
MPI_Status status;
|
||||||
MPI_Aint sextent, slb, strue_lb, strue_extent;
|
MPI_Aint sextent, slb, strue_lb, strue_extent;
|
||||||
@ -71,111 +64,109 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
|
|||||||
ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);
|
ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent);
|
||||||
|
|
||||||
vrank = (rank - root + size) % size;
|
vrank = (rank - root + size) % size;
|
||||||
|
ptmp = (char *) rbuf; /* by default suppose leaf nodes, just use rbuf */
|
||||||
|
|
||||||
if (rank == root) {
|
if (rank == root) {
|
||||||
if (0 == root) {
|
if (0 == root) {
|
||||||
/* root on 0, just use the send buffer */
|
/* root on 0, just use the send buffer */
|
||||||
ptmp = (char *) sbuf;
|
ptmp = (char *) sbuf;
|
||||||
if (rbuf != MPI_IN_PLACE) {
|
if (rbuf != MPI_IN_PLACE) {
|
||||||
/* local copy to rbuf */
|
/* local copy to rbuf */
|
||||||
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype);
|
rbuf, rcount, rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* root is not on 0, allocate temp buffer for send */
|
/* root is not on 0, allocate temp buffer for send */
|
||||||
tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
|
tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent);
|
||||||
if (NULL == tempbuf) {
|
if (NULL == tempbuf) {
|
||||||
err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptmp = tempbuf - slb;
|
ptmp = tempbuf - slb;
|
||||||
|
|
||||||
/* and rotate data so they will eventually in the right place */
|
/* and rotate data so they will eventually in the right place */
|
||||||
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root),
|
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root),
|
||||||
ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount);
|
ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
|
|
||||||
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root,
|
err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root,
|
||||||
ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf);
|
ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
if (rbuf != MPI_IN_PLACE) {
|
if (rbuf != MPI_IN_PLACE) {
|
||||||
/* local copy to rbuf */
|
/* local copy to rbuf */
|
||||||
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
|
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
|
||||||
rbuf, rcount, rdtype);
|
rbuf, rcount, rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
total_send = scount;
|
total_send = scount;
|
||||||
} else if (!(vrank % 2)) {
|
} else if (!(vrank % 2)) {
|
||||||
/* non-root, non-leaf nodes, allocte temp buffer for recv
|
/* non-root, non-leaf nodes, allocte temp buffer for recv
|
||||||
* the most we need is rcount*size/2 */
|
* the most we need is rcount*size/2 */
|
||||||
tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
|
tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent);
|
||||||
if (NULL == tempbuf) {
|
if (NULL == tempbuf) {
|
||||||
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptmp = tempbuf - rlb;
|
ptmp = tempbuf - rlb;
|
||||||
|
|
||||||
sdtype = rdtype;
|
sdtype = rdtype;
|
||||||
scount = rcount;
|
scount = rcount;
|
||||||
sextent = rextent;
|
sextent = rextent;
|
||||||
total_send = scount;
|
total_send = scount;
|
||||||
} else {
|
|
||||||
/* leaf nodes, just use rbuf */
|
|
||||||
ptmp = (char *) rbuf;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!(vrank % 2)) {
|
if (!(vrank % 2)) {
|
||||||
if (rank != root) {
|
if (rank != root) {
|
||||||
/* recv from parent on non-root */
|
/* recv from parent on non-root */
|
||||||
err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev,
|
err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev,
|
||||||
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
|
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
/* local copy to rbuf */
|
/* local copy to rbuf */
|
||||||
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
|
err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
|
||||||
rbuf, rcount, rdtype);
|
rbuf, rcount, rdtype);
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
/* send to children on all non-leaf */
|
/* send to children on all non-leaf */
|
||||||
for (i = 0; i < bmtree->tree_nextsize; i++) {
|
for (i = 0; i < bmtree->tree_nextsize; i++) {
|
||||||
size_t mycount = 0;
|
size_t mycount = 0;
|
||||||
int vkid;
|
int vkid;
|
||||||
/* figure out how much data I have to send to this child */
|
/* figure out how much data I have to send to this child */
|
||||||
vkid = (bmtree->tree_next[i] - root + size) % size;
|
vkid = (bmtree->tree_next[i] - root + size) % size;
|
||||||
mycount = vkid - vrank;
|
mycount = vkid - vrank;
|
||||||
if( (int)mycount > (size - vkid) )
|
if( (int)mycount > (size - vkid) )
|
||||||
mycount = size - vkid;
|
mycount = size - vkid;
|
||||||
mycount *= scount;
|
mycount *= scount;
|
||||||
|
|
||||||
err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype,
|
err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype,
|
||||||
bmtree->tree_next[i],
|
bmtree->tree_next[i],
|
||||||
MCA_COLL_BASE_TAG_SCATTER,
|
MCA_COLL_BASE_TAG_SCATTER,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
|
|
||||||
total_send += mycount;
|
total_send += mycount;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NULL != tempbuf)
|
if (NULL != tempbuf)
|
||||||
free(tempbuf);
|
free(tempbuf);
|
||||||
} else {
|
} else {
|
||||||
/* recv from parent on leaf nodes */
|
/* recv from parent on leaf nodes */
|
||||||
err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev,
|
err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev,
|
||||||
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
|
MCA_COLL_BASE_TAG_SCATTER, comm, &status));
|
||||||
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
|
||||||
}
|
}
|
||||||
|
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
|
|
||||||
err_hndl:
|
err_hndl:
|
||||||
if (NULL != tempbuf)
|
if (NULL != tempbuf)
|
||||||
free(tempbuf);
|
free(tempbuf);
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d",
|
||||||
__FILE__, line, err, rank));
|
__FILE__, line, err, rank));
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -201,16 +192,16 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount,
|
|||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount,
|
ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void *rbuf, int rcount,
|
void *rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
int i, rank, size, err;
|
int i, rank, size, err;
|
||||||
char *ptmp;
|
|
||||||
ptrdiff_t lb, incr;
|
ptrdiff_t lb, incr;
|
||||||
|
char *ptmp;
|
||||||
|
|
||||||
/* Initialize */
|
/* Initialize */
|
||||||
|
|
||||||
@ -242,7 +233,7 @@ ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount,
|
|||||||
if (MPI_IN_PLACE != rbuf) {
|
if (MPI_IN_PLACE != rbuf) {
|
||||||
err =
|
err =
|
||||||
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
|
ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
|
||||||
rdtype);
|
rdtype);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
|
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
|
||||||
@ -281,123 +272,123 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
|
|||||||
ompi_coll_tuned_forced_max_algorithms[SCATTER] = max_alg;
|
ompi_coll_tuned_forced_max_algorithms[SCATTER] = max_alg;
|
||||||
|
|
||||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||||
"scatter_algorithm_count",
|
"scatter_algorithm_count",
|
||||||
"Number of scatter algorithms available",
|
"Number of scatter algorithms available",
|
||||||
false, true, max_alg, NULL);
|
false, true, max_alg, NULL);
|
||||||
|
|
||||||
mca_param_indices->algorithm_param_index
|
mca_param_indices->algorithm_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"scatter_algorithm",
|
"scatter_algorithm",
|
||||||
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
|
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
|
||||||
false, false, 0, NULL);
|
false, false, 0, NULL);
|
||||||
if (mca_param_indices->algorithm_param_index < 0) {
|
if (mca_param_indices->algorithm_param_index < 0) {
|
||||||
return mca_param_indices->algorithm_param_index;
|
return mca_param_indices->algorithm_param_index;
|
||||||
}
|
}
|
||||||
mca_base_param_lookup_int(mca_param_indices->algorithm_param_index,
|
mca_base_param_lookup_int(mca_param_indices->algorithm_param_index,
|
||||||
&(requested_alg));
|
&(requested_alg));
|
||||||
if( 0 > requested_alg || requested_alg > max_alg ) {
|
if( 0 > requested_alg || requested_alg > max_alg ) {
|
||||||
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
|
||||||
opal_output( 0, "Scatter algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
|
opal_output( 0, "Scatter algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n",
|
||||||
requested_alg, max_alg );
|
requested_alg, max_alg );
|
||||||
}
|
}
|
||||||
mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0);
|
mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
mca_param_indices->segsize_param_index
|
mca_param_indices->segsize_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"scatter_algorithm_segmentsize",
|
"scatter_algorithm_segmentsize",
|
||||||
"Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
"Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.",
|
||||||
false, false, 0, NULL);
|
false, false, 0, NULL);
|
||||||
|
|
||||||
mca_param_indices->tree_fanout_param_index
|
mca_param_indices->tree_fanout_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"scatter_algorithm_tree_fanout",
|
"scatter_algorithm_tree_fanout",
|
||||||
"Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
"Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.",
|
||||||
false, false,
|
false, false,
|
||||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
mca_param_indices->chain_fanout_param_index
|
mca_param_indices->chain_fanout_param_index
|
||||||
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
= mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||||
"scatter_algorithm_chain_fanout",
|
"scatter_algorithm_chain_fanout",
|
||||||
"Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
"Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.",
|
||||||
false, false,
|
false, false,
|
||||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
return (MPI_SUCCESS);
|
return (MPI_SUCCESS);
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
|
ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module)
|
mca_coll_base_module_t *module)
|
||||||
{
|
{
|
||||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:scatter_intra_do_forced selected algorithm %d",
|
"coll:tuned:scatter_intra_do_forced selected algorithm %d",
|
||||||
data->user_forced[SCATTER].algorithm));
|
data->user_forced[SCATTER].algorithm));
|
||||||
|
|
||||||
switch (data->user_forced[SCATTER].algorithm) {
|
switch (data->user_forced[SCATTER].algorithm) {
|
||||||
case (0):
|
case (0):
|
||||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (1):
|
case (1):
|
||||||
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (2):
|
case (2):
|
||||||
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
"coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
data->user_forced[SCATTER].algorithm,
|
data->user_forced[SCATTER].algorithm,
|
||||||
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
||||||
return (MPI_ERR_ARG);
|
return (MPI_ERR_ARG);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
|
ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
|
||||||
struct ompi_datatype_t *sdtype,
|
struct ompi_datatype_t *sdtype,
|
||||||
void* rbuf, int rcount,
|
void* rbuf, int rcount,
|
||||||
struct ompi_datatype_t *rdtype,
|
struct ompi_datatype_t *rdtype,
|
||||||
int root,
|
int root,
|
||||||
struct ompi_communicator_t *comm,
|
struct ompi_communicator_t *comm,
|
||||||
mca_coll_base_module_t *module,
|
mca_coll_base_module_t *module,
|
||||||
int algorithm, int faninout, int segsize)
|
int algorithm, int faninout, int segsize)
|
||||||
{
|
{
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
"coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||||
algorithm, faninout, segsize));
|
algorithm, faninout, segsize));
|
||||||
|
|
||||||
switch (algorithm) {
|
switch (algorithm) {
|
||||||
case (0):
|
case (0):
|
||||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (1):
|
case (1):
|
||||||
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
case (2):
|
case (2):
|
||||||
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
|
return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
default:
|
default:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
algorithm,
|
algorithm,
|
||||||
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
ompi_coll_tuned_forced_max_algorithms[SCATTER]));
|
||||||
return (MPI_ERR_ARG);
|
return (MPI_ERR_ARG);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
}
|
}
|
||||||
|
@ -77,13 +77,10 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
|||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
int root )
|
int root )
|
||||||
{
|
{
|
||||||
int rank, size;
|
int rank, size, schild, sparent, shiftedrank, i;
|
||||||
int schild, sparent;
|
|
||||||
int level; /* location of my rank in the tree structure of size */
|
int level; /* location of my rank in the tree structure of size */
|
||||||
int delta; /* number of nodes on my level */
|
int delta; /* number of nodes on my level */
|
||||||
int slimit; /* total number of nodes on levels above me */
|
int slimit; /* total number of nodes on levels above me */
|
||||||
int shiftedrank;
|
|
||||||
int i;
|
|
||||||
ompi_coll_tree_t* tree;
|
ompi_coll_tree_t* tree;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root));
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root));
|
||||||
@ -192,9 +189,7 @@ ompi_coll_tuned_topo_build_tree( int fanout,
|
|||||||
ompi_coll_tree_t*
|
ompi_coll_tree_t*
|
||||||
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
|
||||||
{
|
{
|
||||||
int rank, size;
|
int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
|
||||||
int myrank, rightsize, delta;
|
|
||||||
int parent, lchild, rchild;
|
|
||||||
ompi_coll_tree_t* tree;
|
ompi_coll_tree_t* tree;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -329,14 +324,8 @@ ompi_coll_tree_t*
|
|||||||
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||||
int root )
|
int root )
|
||||||
{
|
{
|
||||||
int childs = 0;
|
int childs = 0, rank, size, mask = 1, index, remote, i;
|
||||||
int rank;
|
|
||||||
int size;
|
|
||||||
int mask = 1;
|
|
||||||
int index;
|
|
||||||
int remote;
|
|
||||||
ompi_coll_tree_t *bmtree;
|
ompi_coll_tree_t *bmtree;
|
||||||
int i;
|
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root));
|
||||||
|
|
||||||
@ -358,7 +347,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
|||||||
|
|
||||||
bmtree->tree_root = MPI_UNDEFINED;
|
bmtree->tree_root = MPI_UNDEFINED;
|
||||||
bmtree->tree_nextsize = MPI_UNDEFINED;
|
bmtree->tree_nextsize = MPI_UNDEFINED;
|
||||||
for(i=0;i<MAXTREEFANOUT;i++) {
|
for( i = 0;i < MAXTREEFANOUT; i++ ) {
|
||||||
bmtree->tree_next[i] = -1;
|
bmtree->tree_next[i] = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -409,15 +398,10 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
|||||||
*/
|
*/
|
||||||
ompi_coll_tree_t*
|
ompi_coll_tree_t*
|
||||||
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
||||||
int root )
|
int root )
|
||||||
{
|
{
|
||||||
int childs = 0;
|
int childs = 0, rank, vrank, size, mask = 1, remote, i;
|
||||||
int rank, vrank;
|
|
||||||
int size;
|
|
||||||
int mask = 1;
|
|
||||||
int remote;
|
|
||||||
ompi_coll_tree_t *bmtree;
|
ompi_coll_tree_t *bmtree;
|
||||||
int i;
|
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root));
|
||||||
|
|
||||||
@ -443,25 +427,25 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (root == rank) {
|
if (root == rank) {
|
||||||
bmtree->tree_prev = root;
|
bmtree->tree_prev = root;
|
||||||
}
|
}
|
||||||
|
|
||||||
while (mask < size) {
|
while (mask < size) {
|
||||||
remote = vrank ^ mask;
|
remote = vrank ^ mask;
|
||||||
if (remote < vrank) {
|
if (remote < vrank) {
|
||||||
bmtree->tree_prev = (remote + root) % size;
|
bmtree->tree_prev = (remote + root) % size;
|
||||||
break;
|
break;
|
||||||
} else if (remote < size) {
|
} else if (remote < size) {
|
||||||
bmtree->tree_next[childs] = (remote + root) % size;
|
bmtree->tree_next[childs] = (remote + root) % size;
|
||||||
childs++;
|
childs++;
|
||||||
if (childs==MAXTREEFANOUT) {
|
if (childs==MAXTREEFANOUT) {
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
|
"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
|
||||||
MAXTREEFANOUT, childs));
|
MAXTREEFANOUT, childs));
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mask <<= 1;
|
mask <<= 1;
|
||||||
}
|
}
|
||||||
bmtree->tree_nextsize = childs;
|
bmtree->tree_nextsize = childs;
|
||||||
bmtree->tree_root = root;
|
bmtree->tree_root = root;
|
||||||
@ -475,10 +459,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
|
|||||||
struct ompi_communicator_t* comm,
|
struct ompi_communicator_t* comm,
|
||||||
int root )
|
int root )
|
||||||
{
|
{
|
||||||
int rank, size;
|
int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
|
||||||
int srank; /* shifted rank */
|
|
||||||
int i,maxchainlen;
|
|
||||||
int mark,head,len;
|
|
||||||
ompi_coll_tree_t *chain;
|
ompi_coll_tree_t *chain;
|
||||||
|
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root));
|
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root));
|
||||||
|
@ -52,7 +52,7 @@ ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdataty
|
|||||||
{
|
{
|
||||||
if ((dest == myid) && (source == myid)) {
|
if ((dest == myid) && (source == myid)) {
|
||||||
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
|
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
|
||||||
recvbuf, (int32_t) rcount, rdatatype);
|
recvbuf, (int32_t) rcount, rdatatype);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_sendrecv_actual (sendbuf, scount, sdatatype,
|
return ompi_coll_tuned_sendrecv_actual (sendbuf, scount, sdatatype,
|
||||||
dest, stag,
|
dest, stag,
|
||||||
@ -85,7 +85,7 @@ ompi_coll_tuned_sendrecv_localcompleted( void* sendbuf, size_t scount,
|
|||||||
{
|
{
|
||||||
if ((dest == myid) && (source == myid)) {
|
if ((dest == myid) && (source == myid)) {
|
||||||
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
|
return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
|
||||||
recvbuf, (int32_t) rcount, rdatatype);
|
recvbuf, (int32_t) rcount, rdatatype);
|
||||||
}
|
}
|
||||||
return ompi_coll_tuned_sendrecv_actual_localcompleted (sendbuf, scount,
|
return ompi_coll_tuned_sendrecv_actual_localcompleted (sendbuf, scount,
|
||||||
sdatatype, dest,
|
sdatatype, dest,
|
||||||
@ -103,20 +103,20 @@ ompi_coll_tuned_isendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdtype
|
|||||||
void* recvbuf, size_t rcount, ompi_datatype_t* rdtype,
|
void* recvbuf, size_t rcount, ompi_datatype_t* rdtype,
|
||||||
int source, int rtag, ompi_request_t** rreq,
|
int source, int rtag, ompi_request_t** rreq,
|
||||||
struct ompi_communicator_t* comm ) {
|
struct ompi_communicator_t* comm ) {
|
||||||
int ret, line;
|
int ret, line;
|
||||||
|
|
||||||
ret = MCA_PML_CALL(irecv(recvbuf, rcount, rdtype, source, rtag, comm, rreq));
|
ret = MCA_PML_CALL(irecv(recvbuf, rcount, rdtype, source, rtag, comm, rreq));
|
||||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
|
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
|
||||||
|
|
||||||
ret = MCA_PML_CALL(isend(sendbuf, scount, sdtype, dest, stag,
|
ret = MCA_PML_CALL(isend(sendbuf, scount, sdtype, dest, stag,
|
||||||
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
|
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
|
||||||
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
|
if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
|
||||||
|
|
||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
error_handler:
|
error_handler:
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%d\tError occurred %d\n",
|
OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%d\tError occurred %d\n",
|
||||||
__FILE__, line, ret));
|
__FILE__, line, ret));
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user