diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather.c b/ompi/mca/coll/tuned/coll_tuned_allgather.c index 3fd1257aa2..60fb3ce4e9 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allgather.c +++ b/ompi/mca/coll/tuned/coll_tuned_allgather.c @@ -86,127 +86,124 @@ int ompi_coll_tuned_allgather_intra_bruck(void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int line = -1; - int rank, size; - int sendto, recvfrom, distance, blockcount; - int err = 0; - ptrdiff_t slb, rlb, sext, rext; - char *tmpsend = NULL, *tmprecv = NULL; + int line = -1, rank, size, sendto, recvfrom, distance, blockcount, err = 0; + ptrdiff_t slb, rlb, sext, rext; + char *tmpsend = NULL, *tmprecv = NULL; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_bruck rank %d", rank)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_bruck rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (sdtype, &slb, &sext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Initialization step: - - if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of - receive buffer, else - - if rank r != 0, copy r^th block from receive buffer to block 0. - */ - tmprecv = (char*) rbuf; - if (MPI_IN_PLACE != sbuf) { - tmpsend = (char*) sbuf; - err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Initialization step: + - if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of + receive buffer, else + - if rank r != 0, copy r^th block from receive buffer to block 0. + */ + tmprecv = (char*) rbuf; + if (MPI_IN_PLACE != sbuf) { + tmpsend = (char*) sbuf; + err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } else if (0 != rank) { - tmpsend = ((char*)rbuf) + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; - err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend); - if (err < 0) { line = __LINE__; goto err_hndl; } - } + } else if (0 != rank) { + tmpsend = ((char*)rbuf) + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; + err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmprecv, tmpsend); + if (err < 0) { line = __LINE__; goto err_hndl; } + } - /* Communication step: - At every step i, rank r: - - doubles the distance - - sends message which starts at begining of rbuf and has size - (blockcount * rcount) to rank (r - distance) - - receives message of size blockcount * rcount from rank (r + distance) - at location (rbuf + distance * rcount * rext) - - blockcount doubles until last step when only the remaining data is - exchanged. 
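[review note] For reviewers who do not have Bruck's algorithm paged in: the comment block above is the heart of this routine. Below is a minimal sketch of the round structure at the public MPI level, assuming contiguous MPI_INT blocks of `count` elements; the name bruck_rounds is hypothetical, and the patched code uses the internal ompi_coll_tuned_sendrecv with generic datatypes instead of MPI_Sendrecv.

    #include <stddef.h>
    #include <mpi.h>

    /* Assumes rbuf already holds this rank's block in slot 0
     * (the initialization step above). */
    static void bruck_rounds(int *rbuf, int count, MPI_Comm comm)
    {
        int rank, size, distance;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        for (distance = 1; distance < size; distance <<= 1) {
            int recvfrom = (rank + distance) % size;
            int sendto   = (rank - distance + size) % size;
            /* doubling rounds send `distance` blocks; the last round of a
             * non-power-of-two size sends only what is still missing */
            int blockcount = (distance <= size / 2) ? distance
                                                    : size - distance;
            MPI_Sendrecv(rbuf, blockcount * count, MPI_INT, sendto, 0,
                         rbuf + (ptrdiff_t)distance * count,
                         blockcount * count, MPI_INT, recvfrom, 0,
                         comm, MPI_STATUS_IGNORE);
        }
        /* slot p now holds block (rank + p) % size; the finalization
         * step below rotates blocks into natural order */
    }

After ceil(log2(size)) such rounds every rank holds all size blocks, which is why this algorithm is attractive for small messages. [end note]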
- */ - blockcount = 1; - tmpsend = (char*) rbuf; - for (distance = 1; distance < size; distance<<=1) { + /* Communication step: + At every step i, rank r: + - doubles the distance + - sends message which starts at beginning of rbuf and has size + (blockcount * rcount) to rank (r - distance) + - receives message of size blockcount * rcount from rank (r + distance) + at location (rbuf + distance * rcount * rext) + - blockcount doubles until the last step when only the remaining data is + exchanged. + */ + blockcount = 1; + tmpsend = (char*) rbuf; + for (distance = 1; distance < size; distance<<=1) { - recvfrom = (rank + distance) % size; - sendto = (rank - distance + size) % size; + recvfrom = (rank + distance) % size; + sendto = (rank - distance + size) % size; - tmprecv = tmpsend + (ptrdiff_t)distance * (ptrdiff_t)rcount * rext; + tmprecv = tmpsend + (ptrdiff_t)distance * (ptrdiff_t)rcount * rext; - if (distance <= (size >> 1)) { - blockcount = distance; - } else { - blockcount = size - distance; - } + if (distance <= (size >> 1)) { + blockcount = distance; + } else { + blockcount = size - distance; + } - /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, blockcount * rcount, rdtype, - sendto, MCA_COLL_BASE_TAG_ALLGATHER, - tmprecv, blockcount * rcount, rdtype, - recvfrom, MCA_COLL_BASE_TAG_ALLGATHER, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Sendreceive */ + err = ompi_coll_tuned_sendrecv(tmpsend, blockcount * rcount, rdtype, + sendto, MCA_COLL_BASE_TAG_ALLGATHER, + tmprecv, blockcount * rcount, rdtype, + recvfrom, MCA_COLL_BASE_TAG_ALLGATHER, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + } - /* Finalization step: - On all nodes except 0, data needs to be shifted locally: - - create temporary shift buffer, - see discussion in coll_basic_reduce.c about the size and begining - of temporary buffer. - - copy blocks [0 .. (size - rank - 1)] in rbuf to shift buffer - - move blocks [(size - rank) .. size] in rbuf to begining of rbuf - - copy blocks from shift buffer starting at block [rank] in rbuf. - */ - if (0 != rank) { - ptrdiff_t true_extent, true_lb; - char *free_buf = NULL, *shift_buf = NULL; + /* Finalization step: + On all nodes except 0, data needs to be shifted locally: + - create temporary shift buffer, + see discussion in coll_basic_reduce.c about the size and beginning + of temporary buffer. + - copy blocks [0 .. (size - rank - 1)] in rbuf to shift buffer + - move blocks [(size - rank) .. size] in rbuf to beginning of rbuf + - copy blocks from shift buffer starting at block [rank] in rbuf.
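[review note] The finalization being described is a plain block rotation: after the rounds, slot p holds block (rank + p) % size, so for size = 4, rank = 1 the slots read [1 2 3 0] and must become [0 1 2 3]. A sketch with plain int blocks; rotate_blocks is a hypothetical name, and unlike this sketch the patched code sizes the scratch buffer from the datatype's true extent and steps by the extent rext, not sizeof(int).

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>

    static int rotate_blocks(int *rbuf, int count, int rank, int size)
    {
        size_t blk = (size_t)count * sizeof(int);
        int *shift = malloc((size_t)(size - rank) * blk);
        if (NULL == shift) return -1;

        /* slots [0 .. size-rank) hold blocks rank .. size-1: park them */
        memcpy(shift, rbuf, (size_t)(size - rank) * blk);
        /* slots [size-rank .. size) hold blocks 0 .. rank-1: move them
         * to the front (memmove: the two regions may overlap) */
        memmove(rbuf, rbuf + (ptrdiff_t)(size - rank) * count,
                (size_t)rank * blk);
        /* parked blocks land in slots [rank .. size) */
        memcpy(rbuf + (ptrdiff_t)rank * count, shift,
               (size_t)(size - rank) * blk);

        free(shift);
        return 0;
    }

[end note]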
+ */ + if (0 != rank) { + ptrdiff_t true_extent, true_lb; + char *free_buf = NULL, *shift_buf = NULL; - err = ompi_datatype_get_true_extent(rdtype, &true_lb, &true_extent); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_true_extent(rdtype, &true_lb, &true_extent); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - free_buf = (char*) calloc(((true_extent + true_lb + - ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)), - sizeof(char)); - if (NULL == free_buf) { - line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; - } - shift_buf = free_buf - rlb; + free_buf = (char*) calloc(((true_extent + true_lb + + ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)), + sizeof(char)); + if (NULL == free_buf) { + line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; + } + shift_buf = free_buf - rlb; - tmpsend = (char*) rbuf; - err = ompi_datatype_copy_content_same_ddt(rdtype, ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount), - shift_buf, tmpsend); - if (err < 0) { line = __LINE__; goto err_hndl; } + tmpsend = (char*) rbuf; + err = ompi_datatype_copy_content_same_ddt(rdtype, ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount), + shift_buf, tmpsend); + if (err < 0) { line = __LINE__; goto err_hndl; } - tmprecv = (char*) rbuf; - tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext; + tmprecv = (char*) rbuf; + tmpsend = (char*) rbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount * rext; - err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount, - tmprecv, tmpsend); - if (err < 0) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rank * (ptrdiff_t)rcount, + tmprecv, tmpsend); + if (err < 0) { line = __LINE__; goto err_hndl; } - tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; - err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount, - tmprecv, shift_buf); - if (err < 0) { line = __LINE__; goto err_hndl; } + tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; + err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)(size - rank) * (ptrdiff_t)rcount, + tmprecv, shift_buf); + if (err < 0) { line = __LINE__; goto err_hndl; } - free(free_buf); - } + free(free_buf); + } - return OMPI_SUCCESS; + return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); - return err; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank)); + return err; } /* @@ -260,91 +257,89 @@ ompi_coll_tuned_allgather_intra_recursivedoubling(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1; - int rank, size, pow2size; - int remote, distance, sendblocklocation; - int err = 0; - ptrdiff_t slb, rlb, sext, rext; - char *tmpsend = NULL, *tmprecv = NULL; + int line = -1, rank, size, pow2size, err; + int remote, distance, sendblocklocation; + ptrdiff_t slb, rlb, sext, rext; + char *tmpsend = NULL, *tmprecv = NULL; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - pow2size = opal_next_poweroftwo (size); - pow2size >>=1; + pow2size = opal_next_poweroftwo (size); + pow2size >>=1; - /* Current implementation only handles power-of-two 
number of processes. - If the function was called on non-power-of-two number of processes, - print warning and call bruck allgather algorithm with same parameters. + /* Current implementation only handles power-of-two number of processes. + If the function was called on a non-power-of-two number of processes, + print a warning and call the bruck allgather algorithm with the same parameters. */ - if (pow2size != size) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm", - size)); + if (pow2size != size) { + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_recursivedoubling WARNING: non-pow-2 size %d, switching to bruck algorithm", + size)); - return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - } + return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + } - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_recursivedoubling rank %d, size %d", - rank, size)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_recursivedoubling rank %d, size %d", + rank, size)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (sdtype, &slb, &sext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Initialization step: - - if send buffer is not MPI_IN_PLACE, copy send buffer to block 0 of - receive buffer - */ - if (MPI_IN_PLACE != sbuf) { - tmpsend = (char*) sbuf; - tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; - err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Initialization step: + - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of + receive buffer + */ + if (MPI_IN_PLACE != sbuf) { + tmpsend = (char*) sbuf; + tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; + err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + } - /* Communication step: - At every step i, rank r: - - exchanges message with rank remote = (r ^ 2^i).
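[review note] The exchange pattern for the power-of-two case, sketched against the public MPI API (contiguous MPI_INT blocks; recdbl_allgather is a hypothetical name). The one subtle piece is the bookkeeping for where this rank's contiguous run of blocks starts, which the patched code tracks in sendblocklocation.

    #include <stddef.h>
    #include <mpi.h>

    /* size must be a power of two; own block already sits at slot rank */
    static void recdbl_allgather(int *rbuf, int count, MPI_Comm comm)
    {
        int rank, size, distance;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        int loc = rank;   /* start of the run this rank currently owns */
        for (distance = 1; distance < size; distance <<= 1) {
            int remote = rank ^ distance;
            int sendoff = loc, recvoff;
            if (rank < remote) {
                recvoff = loc + distance;  /* partner's run sits above ours */
            } else {
                recvoff = loc - distance;  /* partner's run sits below ours */
                loc -= distance;           /* our run now starts lower */
            }
            /* both sides own exactly `distance` blocks in this round */
            MPI_Sendrecv(rbuf + (ptrdiff_t)sendoff * count,
                         distance * count, MPI_INT, remote, 0,
                         rbuf + (ptrdiff_t)recvoff * count,
                         distance * count, MPI_INT, remote, 0,
                         comm, MPI_STATUS_IGNORE);
        }
    }

[end note]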
- */ - sendblocklocation = rank; - for (distance = 0x1; distance < size; distance<<=1) { - remote = rank ^ distance; + */ + sendblocklocation = rank; + for (distance = 0x1; distance < size; distance<<=1) { + remote = rank ^ distance; - if (rank < remote) { - tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext; - tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation + distance) * (ptrdiff_t)rcount * rext; - } else { - tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext; - tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation - distance) * (ptrdiff_t)rcount * rext; - sendblocklocation -= distance; - } + if (rank < remote) { + tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext; + tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation + distance) * (ptrdiff_t)rcount * rext; + } else { + tmpsend = (char*)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext; + tmprecv = (char*)rbuf + (ptrdiff_t)(sendblocklocation - distance) * (ptrdiff_t)rcount * rext; + sendblocklocation -= distance; + } - /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, - remote, MCA_COLL_BASE_TAG_ALLGATHER, - tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, - remote, MCA_COLL_BASE_TAG_ALLGATHER, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Sendreceive */ + err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, + remote, MCA_COLL_BASE_TAG_ALLGATHER, + tmprecv, (ptrdiff_t)distance * (ptrdiff_t)rcount, rdtype, + remote, MCA_COLL_BASE_TAG_ALLGATHER, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + } - return OMPI_SUCCESS; + return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); - return err; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank)); + return err; } @@ -369,72 +364,69 @@ int ompi_coll_tuned_allgather_intra_ring(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1; - int rank, size; - int sendto, recvfrom, i, recvdatafrom, senddatafrom; - int err = 0; - ptrdiff_t slb, rlb, sext, rext; - char *tmpsend = NULL, *tmprecv = NULL; + int line = -1, rank, size, err, sendto, recvfrom, i, recvdatafrom, senddatafrom; + ptrdiff_t slb, rlb, sext, rext; + char *tmpsend = NULL, *tmprecv = NULL; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_ring rank %d", rank)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_ring rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (sdtype, &slb, &sext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Initialization step: - - if send buffer is not MPI_IN_PLACE, copy send buffer to 
appropriate block - of receive buffer - */ - tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; - if (MPI_IN_PLACE != sbuf) { - tmpsend = (char*) sbuf; - err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + /* Initialization step: + - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block + of receive buffer + */ + tmprecv = (char*) rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; + if (MPI_IN_PLACE != sbuf) { + tmpsend = (char*) sbuf; + err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } - /* Communication step: - At every step i: 0 .. (P-1), rank r: - - receives message from [(r - 1 + size) % size] containing data from rank - [(r - i - 1 + size) % size] - - sends message to rank [(r + 1) % size] containing data from rank - [(r - i + size) % size] - - sends message which starts at begining of rbuf and has size - */ - sendto = (rank + 1) % size; - recvfrom = (rank - 1 + size) % size; + /* Communication step: + At every step i: 0 .. (P-1), rank r: + - receives message from [(r - 1 + size) % size] containing data from rank + [(r - i - 1 + size) % size] + - sends message to rank [(r + 1) % size] containing data from rank + [(r - i + size) % size] + - after (size - 1) steps, each rank has received all blocks + */ + sendto = (rank + 1) % size; + recvfrom = (rank - 1 + size) % size; - for (i = 0; i < size - 1; i++) { - recvdatafrom = (rank - i - 1 + size) % size; - senddatafrom = (rank - i + size) % size; + for (i = 0; i < size - 1; i++) { + recvdatafrom = (rank - i - 1 + size) % size; + senddatafrom = (rank - i + size) % size; - tmprecv = (char*)rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)rcount * rext; - tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext; + tmprecv = (char*)rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)rcount * rext; + tmpsend = (char*)rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)rcount * rext; - /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, sendto, - MCA_COLL_BASE_TAG_ALLGATHER, - tmprecv, rcount, rdtype, recvfrom, - MCA_COLL_BASE_TAG_ALLGATHER, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Sendreceive */ + err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, sendto, + MCA_COLL_BASE_TAG_ALLGATHER, + tmprecv, rcount, rdtype, recvfrom, + MCA_COLL_BASE_TAG_ALLGATHER, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + } - return OMPI_SUCCESS; + return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); - return err; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank)); + return err; } /* @@ -500,117 +492,114 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1; - int rank, size; - int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from; - int i, even_rank; - int err = 0; - ptrdiff_t slb, rlb, sext, rext; - char *tmpsend = NULL, *tmprecv = NULL; + int line = -1, rank, size, i, even_rank, err; + int neighbor[2], offset_at_step[2], recv_data_from[2],
send_data_from; + ptrdiff_t slb, rlb, sext, rext; + char *tmpsend = NULL, *tmprecv = NULL; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - if (size % 2) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", - size)); - return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - } + if (size % 2) { + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", + size)); + return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + } - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_neighborexchange rank %d", rank)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_neighborexchange rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (sdtype, &slb, &sext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Initialization step: - - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block - of receive buffer - */ - tmprecv = (char*) rbuf + (ptrdiff_t)rank *(ptrdiff_t) rcount * rext; - if (MPI_IN_PLACE != sbuf) { - tmpsend = (char*) sbuf; - err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + /* Initialization step: + - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block + of receive buffer + */ + tmprecv = (char*) rbuf + (ptrdiff_t)rank *(ptrdiff_t) rcount * rext; + if (MPI_IN_PLACE != sbuf) { + tmpsend = (char*) sbuf; + err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } - /* Determine neighbors, order in which blocks will arrive, etc. */ - even_rank = !(rank % 2); - if (even_rank) { - neighbor[0] = (rank + 1) % size; - neighbor[1] = (rank - 1 + size) % size; - recv_data_from[0] = rank; - recv_data_from[1] = rank; - offset_at_step[0] = (+2); - offset_at_step[1] = (-2); - } else { - neighbor[0] = (rank - 1 + size) % size; - neighbor[1] = (rank + 1) % size; - recv_data_from[0] = neighbor[0]; - recv_data_from[1] = neighbor[0]; - offset_at_step[0] = (-2); - offset_at_step[1] = (+2); - } + /* Determine neighbors, order in which blocks will arrive, etc. */ + even_rank = !(rank % 2); + if (even_rank) { + neighbor[0] = (rank + 1) % size; + neighbor[1] = (rank - 1 + size) % size; + recv_data_from[0] = rank; + recv_data_from[1] = rank; + offset_at_step[0] = (+2); + offset_at_step[1] = (-2); + } else { + neighbor[0] = (rank - 1 + size) % size; + neighbor[1] = (rank + 1) % size; + recv_data_from[0] = neighbor[0]; + recv_data_from[1] = neighbor[0]; + offset_at_step[0] = (-2); + offset_at_step[1] = (+2); + } - /* Communication loop: - - First step is special: exchange a single block with neighbor[0]. - - Rest of the steps: - update recv_data_from according to offset, and - exchange two blocks with appropriate neighbor. 
- the send location becomes previous receve location. - */ - tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext; - tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; - /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, neighbor[0], - MCA_COLL_BASE_TAG_ALLGATHER, - tmprecv, rcount, rdtype, neighbor[0], - MCA_COLL_BASE_TAG_ALLGATHER, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Communication loop: + - First step is special: exchange a single block with neighbor[0]. + - Rest of the steps: + update recv_data_from according to offset, and + exchange two blocks with appropriate neighbor. + the send location becomes the previous receive location. + */ + tmprecv = (char*)rbuf + (ptrdiff_t)neighbor[0] * (ptrdiff_t)rcount * rext; + tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; + /* Sendreceive */ + err = ompi_coll_tuned_sendrecv(tmpsend, rcount, rdtype, neighbor[0], + MCA_COLL_BASE_TAG_ALLGATHER, + tmprecv, rcount, rdtype, neighbor[0], + MCA_COLL_BASE_TAG_ALLGATHER, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Determine initial sending location */ - if (even_rank) { - send_data_from = rank; - } else { - send_data_from = recv_data_from[0]; - } + /* Determine initial sending location */ + if (even_rank) { + send_data_from = rank; + } else { + send_data_from = recv_data_from[0]; + } - for (i = 1; i < (size / 2); i++) { - const int i_parity = i % 2; - recv_data_from[i_parity] = - (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size; + for (i = 1; i < (size / 2); i++) { + const int i_parity = i % 2; + recv_data_from[i_parity] = + (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size; - tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext; - tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * rcount * rext; + tmprecv = (char*)rbuf + (ptrdiff_t)recv_data_from[i_parity] * (ptrdiff_t)rcount * rext; + tmpsend = (char*)rbuf + (ptrdiff_t)send_data_from * rcount * rext; - /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype, - neighbor[i_parity], - MCA_COLL_BASE_TAG_ALLGATHER, - tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype, - neighbor[i_parity], - MCA_COLL_BASE_TAG_ALLGATHER, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Sendreceive */ + err = ompi_coll_tuned_sendrecv(tmpsend, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype, + neighbor[i_parity], + MCA_COLL_BASE_TAG_ALLGATHER, + tmprecv, (ptrdiff_t)2 * (ptrdiff_t)rcount, rdtype, + neighbor[i_parity], + MCA_COLL_BASE_TAG_ALLGATHER, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - send_data_from = recv_data_from[i_parity]; - } + send_data_from = recv_data_from[i_parity]; + } - return OMPI_SUCCESS; + return OMPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); - return err; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank)); + return err; } @@ -619,59 +608,57 @@ int ompi_coll_tuned_allgather_intra_two_procs(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1, err = 0; - int rank; - int
remote; - char *tmpsend = NULL, *tmprecv = NULL; - ptrdiff_t sext, rext, lb; + int line = -1, err, rank, remote; + char *tmpsend = NULL, *tmprecv = NULL; + ptrdiff_t sext, rext, lb; - rank = ompi_comm_rank(comm); + rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_allgather_intra_two_procs rank %d", rank)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_allgather_intra_two_procs rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &lb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (sdtype, &lb, &sext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &lb, &rext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (rdtype, &lb, &rext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Exchange data: - - compute source and destinations - - send receive data + /* Exchange data: + - compute source and destinations + - send receive data */ - remote = rank ^ 0x1; + remote = rank ^ 0x1; - tmpsend = (char*)sbuf; - if (MPI_IN_PLACE == sbuf) { - tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; - scount = rcount; - sdtype = rdtype; - } - tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext; + tmpsend = (char*)sbuf; + if (MPI_IN_PLACE == sbuf) { + tmpsend = (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext; + scount = rcount; + sdtype = rdtype; + } + tmprecv = (char*)rbuf + (ptrdiff_t)remote * (ptrdiff_t)rcount * rext; - err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote, - MCA_COLL_BASE_TAG_ALLGATHER, - tmprecv, rcount, rdtype, remote, - MCA_COLL_BASE_TAG_ALLGATHER, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_coll_tuned_sendrecv(tmpsend, scount, sdtype, remote, + MCA_COLL_BASE_TAG_ALLGATHER, + tmprecv, rcount, rdtype, remote, + MCA_COLL_BASE_TAG_ALLGATHER, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Place your data in correct location if necessary */ - if (MPI_IN_PLACE != sbuf) { - err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype, - (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + /* Place your data in correct location if necessary */ + if (MPI_IN_PLACE != sbuf) { + err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype, + (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * rext, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } - return MPI_SUCCESS; + return MPI_SUCCESS; err_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); - return err; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank)); + return err; } @@ -703,7 +690,7 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { int err; ptrdiff_t lb, extent; @@ -714,17 +701,17 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount, as sbuf and avoid using a temporary buffer if gather is implemented correctly */ if (MPI_IN_PLACE == sbuf && 0 != ompi_comm_rank(comm)) { - ompi_datatype_get_extent(rdtype, &lb, &extent); - sbuf = 
((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount); - sdtype = rdtype; - scount = rcount; + ompi_datatype_get_extent(rdtype, &lb, &extent); + sbuf = ((char*) rbuf) + (ompi_comm_rank(comm) * extent * rcount); + sdtype = rdtype; + scount = rcount; } /* Gather and broadcast. */ err = comm->c_coll.coll_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - 0, comm, comm->c_coll.coll_gather_module); + rbuf, rcount, rdtype, + 0, comm, comm->c_coll.coll_gather_module); if (MPI_SUCCESS == err) { size_t length = (ptrdiff_t)rcount * ompi_comm_size(comm); if( length < (size_t)INT_MAX ) { @@ -761,54 +748,54 @@ ompi_coll_tuned_allgather_intra_basic_linear(void *sbuf, int scount, int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices) { - int max_alg = 6, requested_alg; + int max_alg = 6, requested_alg; - ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = max_alg; + ompi_coll_tuned_forced_max_algorithms[ALLGATHER] = max_alg; - mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_count", - "Number of allgather algorithms available", - false, true, max_alg, NULL); + mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version, + "allgather_algorithm_count", + "Number of allgather algorithms available", + false, true, max_alg, NULL); mca_param_indices->algorithm_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm", - "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.", - false, false, 0, NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "allgather_algorithm", + "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only.", + false, false, 0, NULL); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; } mca_base_param_lookup_int(mca_param_indices->algorithm_param_index, &(requested_alg)); if( 0 > requested_alg || requested_alg > max_alg ) { - if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { - opal_output( 0, "Allgather algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n", - requested_alg, max_alg ); - } - mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0); + if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { + opal_output( 0, "Allgather algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n", + requested_alg, max_alg ); + } + mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0); } mca_param_indices->segsize_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_segmentsize", - "Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.", - false, false, 0, NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "allgather_algorithm_segmentsize", + "Segment size in bytes used by default for allgather algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. 
Currently, available algorithms do not support segmentation.", + false, false, 0, NULL); mca_param_indices->tree_fanout_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_tree_fanout", - "Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.", - false, false, - ompi_coll_tuned_init_tree_fanout, /* get system wide default */ - NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "allgather_algorithm_tree_fanout", + "Fanout for n-tree used for allgather algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.", + false, false, + ompi_coll_tuned_init_tree_fanout, /* get system wide default */ + NULL); mca_param_indices->chain_fanout_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "allgather_algorithm_chain_fanout", - "Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.", - false, false, - ompi_coll_tuned_init_chain_fanout, /* get system wide default */ - NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "allgather_algorithm_chain_fanout", + "Fanout for chains used for allgather algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.", + false, false, + ompi_coll_tuned_init_chain_fanout, /* get system wide default */ + NULL); return (MPI_SUCCESS); } @@ -818,101 +805,101 @@ int ompi_coll_tuned_allgather_intra_do_forced(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_forced selected algorithm %d", - data->user_forced[ALLGATHER].algorithm)); + "coll:tuned:allgather_intra_do_forced selected algorithm %d", + data->user_forced[ALLGATHER].algorithm)); switch (data->user_forced[ALLGATHER].algorithm) { case (0): - return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); + return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); case (1): - return ompi_coll_tuned_allgather_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); + return ompi_coll_tuned_allgather_intra_basic_linear (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); case (2): - return ompi_coll_tuned_allgather_intra_bruck (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); + return ompi_coll_tuned_allgather_intra_bruck (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); case (3): - return ompi_coll_tuned_allgather_intra_recursivedoubling (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); + return ompi_coll_tuned_allgather_intra_recursivedoubling (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); case (4): - return ompi_coll_tuned_allgather_intra_ring (sbuf, scount, 
sdtype, - rbuf, rcount, rdtype, - comm, module); + return ompi_coll_tuned_allgather_intra_ring (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); case (5): - return ompi_coll_tuned_allgather_intra_neighborexchange (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); + return ompi_coll_tuned_allgather_intra_neighborexchange (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); case (6): - return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); + return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[ALLGATHER].algorithm, - ompi_coll_tuned_forced_max_algorithms[ALLGATHER])); - return (MPI_ERR_ARG); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + data->user_forced[ALLGATHER].algorithm, + ompi_coll_tuned_forced_max_algorithms[ALLGATHER])); + return (MPI_ERR_ARG); } /* switch */ } int ompi_coll_tuned_allgather_intra_do_this(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module, + int algorithm, int faninout, int segsize) { - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); - switch (algorithm) { - case (0): - return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (1): - return ompi_coll_tuned_allgather_intra_basic_linear(sbuf, scount, sdtype, - rbuf, rcount, rdtype, + switch (algorithm) { + case (0): + return ompi_coll_tuned_allgather_intra_dec_fixed(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + case (1): + return ompi_coll_tuned_allgather_intra_basic_linear(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + case (2): + return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + case (3): + return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + case (4): + return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + case (5): + return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module); + case (6): + return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype, + rbuf, rcount, rdtype, comm, module); - case (2): - return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (3): - return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (4): - return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, 
module); - case (5): - return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - case (6): - return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, module); - default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, - ompi_coll_tuned_forced_max_algorithms[ALLGATHER])); - return (MPI_ERR_ARG); - } /* switch */ + default: + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, + ompi_coll_tuned_forced_max_algorithms[ALLGATHER])); + return (MPI_ERR_ARG); + } /* switch */ } diff --git a/ompi/mca/coll/tuned/coll_tuned_allgatherv.c b/ompi/mca/coll/tuned/coll_tuned_allgatherv.c index 845f078a9c..e28729abac 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allgatherv.c +++ b/ompi/mca/coll/tuned/coll_tuned_allgatherv.c @@ -90,114 +90,111 @@ int ompi_coll_tuned_allgatherv_intra_bruck(void *sbuf, int scount, int *rdispls, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1, err = 0; - int rank, size; - int sendto, recvfrom, distance, blockcount, i; - int *new_rcounts = NULL, *new_rdispls = NULL; - int *new_scounts = NULL, *new_sdispls = NULL; - ptrdiff_t slb, rlb, sext, rext; - char *tmpsend = NULL, *tmprecv = NULL; - struct ompi_datatype_t *new_rdtype, *new_sdtype; + int line = -1, err = 0, rank, size, sendto, recvfrom, distance, blockcount, i; + int *new_rcounts = NULL, *new_rdispls = NULL, *new_scounts = NULL, *new_sdispls = NULL; + ptrdiff_t slb, rlb, sext, rext; + char *tmpsend = NULL, *tmprecv = NULL; + struct ompi_datatype_t *new_rdtype, *new_sdtype; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allgather_intra_bruck rank %d", rank)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allgatherv_intra_bruck rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (sdtype, &slb, &sext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_get_extent (rdtype, &rlb, &rext); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Initialization step: - - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of - the receive buffer. - */ - tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext; - if (MPI_IN_PLACE != sbuf) { - tmpsend = (char*) sbuf; - err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, - tmprecv, rcounts[rank], rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* Initialization step: + - if send buffer is not MPI_IN_PLACE, copy send buffer to block rank of + the receive buffer.
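[review note] Unlike the fixed-count allgather above, the allgatherv blocks are not equally spaced, so a round's run of blocks cannot be shipped as one contiguous region; the loop below builds indexed send and receive datatypes for each step instead. An MPI-level sketch of the same idea (send_block_run is a hypothetical helper; MPI_INT elements, with rcounts/rdispls as passed to MPI_Allgatherv):

    #include <stdlib.h>
    #include <mpi.h>

    static int send_block_run(const int *rbuf, int first_block,
                              int blockcount, const int *rcounts,
                              const int *rdispls, int comm_size,
                              int dest, MPI_Comm comm)
    {
        int i, err;
        MPI_Datatype run;
        int *cnts = malloc(2 * (size_t)blockcount * sizeof(int));
        int *disp;
        if (NULL == cnts) return -1;
        disp = cnts + blockcount;

        for (i = 0; i < blockcount; i++) {
            int b = (first_block + i) % comm_size;  /* ring wrap-around */
            cnts[i] = rcounts[b];
            disp[i] = rdispls[b];  /* in units of the element type */
        }
        MPI_Type_indexed(blockcount, cnts, disp, MPI_INT, &run);
        MPI_Type_commit(&run);
        err = MPI_Send((void*)rbuf, 1, run, dest, 0, comm);
        MPI_Type_free(&run);
        free(cnts);
        return err;
    }

One design consequence visible below: every round pays for creating, committing, and destroying two datatypes, overhead the fixed-count bruck variant does not have. [end note]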
+ */ + tmprecv = (char*) rbuf + (ptrdiff_t)rdispls[rank] * rext; + if (MPI_IN_PLACE != sbuf) { + tmpsend = (char*) sbuf; + err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, + tmprecv, rcounts[rank], rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } + } - /* Communication step: - At every step i, rank r: - - doubles the distance - - sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i]) - to rank (r - distance) - - receives message of blockcount blocks, - (rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from - rank (r + distance) - - blockcount doubles until the last step when only the remaining data is - exchanged. - */ - blockcount = 1; - tmpsend = (char*) rbuf; + /* Communication step: + At every step i, rank r: + - doubles the distance + - sends message with blockcount blocks, (rbuf[rank] .. rbuf[rank + 2^i]) + to rank (r - distance) + - receives message of blockcount blocks, + (rbuf[r + distance] ... rbuf[(r+distance) + 2^i]) from + rank (r + distance) + - blockcount doubles until the last step when only the remaining data is + exchanged. + */ + blockcount = 1; + tmpsend = (char*) rbuf; - new_rcounts = (int*) calloc(4*size, sizeof(int)); - if (NULL == new_rcounts) { err = -1; line = __LINE__; goto err_hndl; } - new_rdispls = new_rcounts + size; - new_scounts = new_rdispls + size; - new_sdispls = new_scounts + size; + new_rcounts = (int*) calloc(4*size, sizeof(int)); + if (NULL == new_rcounts) { err = -1; line = __LINE__; goto err_hndl; } + new_rdispls = new_rcounts + size; + new_scounts = new_rdispls + size; + new_sdispls = new_scounts + size; - for (distance = 1; distance < size; distance<<=1) { + for (distance = 1; distance < size; distance<<=1) { - recvfrom = (rank + distance) % size; - sendto = (rank - distance + size) % size; + recvfrom = (rank + distance) % size; + sendto = (rank - distance + size) % size; - if (distance <= (size >> 1)) { - blockcount = distance; - } else { - blockcount = size - distance; - } + if (distance <= (size >> 1)) { + blockcount = distance; + } else { + blockcount = size - distance; + } - /* create send and receive datatypes */ - for (i = 0; i < blockcount; i++) { - const int tmp_srank = (rank + i) % size; - const int tmp_rrank = (recvfrom + i) % size; - new_scounts[i] = rcounts[tmp_srank]; - new_sdispls[i] = rdispls[tmp_srank]; - new_rcounts[i] = rcounts[tmp_rrank]; - new_rdispls[i] = rdispls[tmp_rrank]; - } - err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls, - rdtype, &new_sdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_create_indexed(blockcount, new_rcounts, new_rdispls, - rdtype, &new_rdtype); + /* create send and receive datatypes */ + for (i = 0; i < blockcount; i++) { + const int tmp_srank = (rank + i) % size; + const int tmp_rrank = (recvfrom + i) % size; + new_scounts[i] = rcounts[tmp_srank]; + new_sdispls[i] = rdispls[tmp_srank]; + new_rcounts[i] = rcounts[tmp_rrank]; + new_rdispls[i] = rdispls[tmp_rrank]; + } + err = ompi_datatype_create_indexed(blockcount, new_scounts, new_sdispls, + rdtype, &new_sdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_create_indexed(blockcount, new_rcounts, new_rdispls, + rdtype, &new_rdtype); - err = ompi_datatype_commit(&new_sdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_commit(&new_rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_commit(&new_sdtype); + if (MPI_SUCCESS != 
err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_commit(&new_rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* Sendreceive */ - err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto, - MCA_COLL_BASE_TAG_ALLGATHERV, - rbuf, 1, new_rdtype, recvfrom, - MCA_COLL_BASE_TAG_ALLGATHERV, - comm, MPI_STATUS_IGNORE, rank); - ompi_datatype_destroy(&new_sdtype); - ompi_datatype_destroy(&new_rdtype); + /* Sendreceive */ + err = ompi_coll_tuned_sendrecv(rbuf, 1, new_sdtype, sendto, + MCA_COLL_BASE_TAG_ALLGATHERV, + rbuf, 1, new_rdtype, recvfrom, + MCA_COLL_BASE_TAG_ALLGATHERV, + comm, MPI_STATUS_IGNORE, rank); + ompi_datatype_destroy(&new_sdtype); + ompi_datatype_destroy(&new_rdtype); - } + } - free(new_rcounts); + free(new_rcounts); - return OMPI_SUCCESS; + return OMPI_SUCCESS; err_hndl: - if( NULL != new_rcounts ) free(new_rcounts); + if( NULL != new_rcounts ) free(new_rcounts); - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); - return err; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", + __FILE__, line, err, rank)); + return err; } @@ -221,12 +218,9 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount, void* rbuf, int *rcounts, int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1; - int rank, size; - int sendto, recvfrom, i, recvdatafrom, senddatafrom; - int err = 0; + int line = -1, rank, size, sendto, recvfrom, i, recvdatafrom, senddatafrom, err = 0; ptrdiff_t slb, rlb, sext, rext; char *tmpsend = NULL, *tmprecv = NULL; @@ -250,7 +244,7 @@ int ompi_coll_tuned_allgatherv_intra_ring(void *sbuf, int scount, if (MPI_IN_PLACE != sbuf) { tmpsend = (char*) sbuf; err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, - tmprecv, rcounts[rank], rdtype); + tmprecv, rcounts[rank], rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } @@ -354,14 +348,11 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, void* rbuf, int *rcounts, int *rdispls, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1; - int rank, size; + int line = -1, rank, size, i, even_rank, err = 0; int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from; int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2]; - int i, even_rank; - int err = 0; ptrdiff_t slb, rlb, sext, rext; char *tmpsend = NULL, *tmprecv = NULL; struct ompi_datatype_t *new_rdtype, *new_sdtype; @@ -396,7 +387,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, if (MPI_IN_PLACE != sbuf) { tmpsend = (char*) sbuf; err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, - tmprecv, rcounts[rank], rdtype); + tmprecv, rcounts[rank], rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } @@ -458,7 +449,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, new_sdispls[0] = rdispls[send_data_from]; new_sdispls[1] = rdispls[(send_data_from + 1)]; err = ompi_datatype_create_indexed(2, new_scounts, new_sdispls, rdtype, - &new_sdtype); + &new_sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = ompi_datatype_commit(&new_sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -468,7 +459,7 @@ ompi_coll_tuned_allgatherv_intra_neighborexchange(void *sbuf, int scount, 
new_rdispls[0] = rdispls[recv_data_from[i_parity]]; new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)]; err = ompi_datatype_create_indexed(2, new_rcounts, new_rdispls, rdtype, - &new_rdtype); + &new_rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = ompi_datatype_commit(&new_rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -505,11 +496,9 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, int *rdispls, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1, err = 0; - int rank; - int remote; + int line = -1, err = 0, rank, remote; char *tmpsend = NULL, *tmprecv = NULL; ptrdiff_t sext, rext, lb; @@ -548,8 +537,8 @@ int ompi_coll_tuned_allgatherv_intra_two_procs(void *sbuf, int scount, /* Place your data in correct location if necessary */ if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_sndrcv((char*)sbuf, scount, sdtype, - (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext, - rcounts[rank], rdtype); + (char*)rbuf + (ptrdiff_t)rdispls[rank] * rext, + rcounts[rank], rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } @@ -591,12 +580,10 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount, int *disps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int i, size, rank ; - int err; - MPI_Aint extent; - MPI_Aint lb; + int i, size, rank, err; + MPI_Aint extent, lb; char *send_buf = NULL; struct ompi_datatype_t *newtype, *send_type; @@ -655,7 +642,7 @@ ompi_coll_tuned_allgatherv_intra_basic_default(void *sbuf, int scount, } comm->c_coll.coll_bcast(rbuf, 1, newtype, 0, comm, - comm->c_coll.coll_bcast_module); + comm->c_coll.coll_bcast_module); ompi_datatype_destroy (&newtype); @@ -736,14 +723,14 @@ int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount, int *rdispls, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allgatherv_intra_do_forced selected algorithm %d", - data->user_forced[ALLGATHERV].algorithm)); + data->user_forced[ALLGATHERV].algorithm)); switch (data->user_forced[ALLGATHERV].algorithm) { case (0): @@ -756,7 +743,7 @@ int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount, comm, module); case (2): return ompi_coll_tuned_allgatherv_intra_bruck (sbuf, scount, sdtype, - rbuf, rcounts, rdispls, rdtype, + rbuf, rcounts, rdispls, rdtype, comm, module); case (3): return ompi_coll_tuned_allgatherv_intra_ring (sbuf, scount, sdtype, @@ -773,7 +760,7 @@ int ompi_coll_tuned_allgatherv_intra_do_forced(void *sbuf, int scount, default: OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allgatherv_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[ALLGATHERV].algorithm, + data->user_forced[ALLGATHERV].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLGATHERV])); return (MPI_ERR_ARG); } /* switch */ @@ -787,7 +774,7 @@ int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount, int *rdispls, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { @@ 
-807,11 +794,11 @@ int ompi_coll_tuned_allgatherv_intra_do_this(void *sbuf, int scount, case (2): return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, - comm, module); + comm, module); case (3): return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, - comm, module); + comm, module); case (4): return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce.c b/ompi/mca/coll/tuned/coll_tuned_allreduce.c index 17090e6b65..5f99e11a2e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allreduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_allreduce.c @@ -48,10 +48,9 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int err; - int rank; + int err, rank; rank = ompi_comm_rank(comm); @@ -69,14 +68,14 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count } } else { err = comm->c_coll.coll_reduce (sbuf, rbuf, count, dtype, op, 0, - comm, comm->c_coll.coll_reduce_module); + comm, comm->c_coll.coll_reduce_module); } if (MPI_SUCCESS != err) { return err; } return comm->c_coll.coll_bcast (rbuf, count, dtype, 0, comm, - comm->c_coll.coll_bcast_module); + comm->c_coll.coll_bcast_module); } /* @@ -126,152 +125,151 @@ ompi_coll_tuned_allreduce_intra_recursivedoubling(void *sbuf, void *rbuf, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int ret, line; - int rank, size, adjsize, remote, distance; - int newrank, newremote, extra_ranks; - char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; - ptrdiff_t true_lb, true_extent, lb, extent; - ompi_request_t *reqs[2] = {NULL, NULL}; + int ret, line, rank, size, adjsize, remote, distance; + int newrank, newremote, extra_ranks; + char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; + ptrdiff_t true_lb, true_extent, lb, extent; + ompi_request_t *reqs[2] = {NULL, NULL}; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allreduce_intra_recursivedoubling rank %d", rank)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allreduce_intra_recursivedoubling rank %d", rank)); - /* Special case for size == 1 */ - if (1 == size) { - if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } - return MPI_SUCCESS; - } + /* Special case for size == 1 */ + if (1 == size) { + if (MPI_IN_PLACE != sbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } + return MPI_SUCCESS; + } - /* Allocate and initialize temporary send buffer */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Allocate and initialize temporary send buffer */ + ret = ompi_datatype_get_extent(dtype, &lb, &extent); + if (MPI_SUCCESS != 
ret) { line = __LINE__; goto error_hndl; } + ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent); - if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } + inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent); + if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } - if (MPI_IN_PLACE == sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } else { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } + if (MPI_IN_PLACE == sbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } else { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } - tmpsend = (char*) inplacebuf; - tmprecv = (char*) rbuf; + tmpsend = (char*) inplacebuf; + tmprecv = (char*) rbuf; - /* Determine nearest power of two less than or equal to size */ - adjsize = opal_next_poweroftwo (size); - adjsize >>= 1; + /* Determine nearest power of two less than or equal to size */ + adjsize = opal_next_poweroftwo (size); + adjsize >>= 1; - /* Handle non-power-of-two case: - - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and - sets new rank to -1. - - Odd ranks less than 2 * extra_ranks receive data from (rank - 1), - apply appropriate operation, and set new rank to rank/2 - - Everyone else sets rank to rank - extra_ranks + /* Handle non-power-of-two case: + - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and + sets new rank to -1. + - Odd ranks less than 2 * extra_ranks receive data from (rank - 1), + apply appropriate operation, and set new rank to rank/2 + - Everyone else sets rank to rank - extra_ranks */ - extra_ranks = size - adjsize; - if (rank < (2 * extra_ranks)) { - if (0 == (rank % 2)) { - ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), - MCA_COLL_BASE_TAG_ALLREDUCE, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - newrank = -1; - } else { - ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1), - MCA_COLL_BASE_TAG_ALLREDUCE, comm, - MPI_STATUS_IGNORE)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); - newrank = rank >> 1; - } - } else { - newrank = rank - extra_ranks; - } + extra_ranks = size - adjsize; + if (rank < (2 * extra_ranks)) { + if (0 == (rank % 2)) { + ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), + MCA_COLL_BASE_TAG_ALLREDUCE, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + newrank = -1; + } else { + ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1), + MCA_COLL_BASE_TAG_ALLREDUCE, comm, + MPI_STATUS_IGNORE)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* tmpsend = tmprecv (op) tmpsend */ + ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); + newrank = rank >> 1; + } + } else { + newrank = rank - extra_ranks; + } - /* Communication/Computation loop - - Exchange message with remote node. 
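
The pre-step above folds a non-power-of-two communicator down to the largest power of two. A small self-contained sketch of the two rank mappings it relies on (function names are illustrative, not from the source); with size = 6 we get adjsize = 4, so ranks 0 and 2 sit out while ranks 1, 3, 4, 5 become virtual ranks 0..3:

    /* Virtual rank after the fold: -1 for ranks that sit out. */
    static int fold_to_power_of_two(int rank, int size, int adjsize)
    {
        int extra_ranks = size - adjsize;   /* adjsize = largest 2^k <= size */
        if (rank < 2 * extra_ranks)
            return (0 == rank % 2) ? -1 : rank / 2;
        return rank - extra_ranks;
    }

    /* Inverse mapping, exactly the 'remote' computation inside the
       distance loop below. */
    static int unfold_virtual_rank(int vrank, int extra_ranks)
    {
        return (vrank < extra_ranks) ? (2 * vrank + 1) : (vrank + extra_ranks);
    }
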
- - Perform appropriate operation taking in account order of operations: - result = value (op) result + /* Communication/Computation loop + - Exchange message with remote node. + - Perform appropriate operation taking in account order of operations: + result = value (op) result */ - for (distance = 0x1; distance < adjsize; distance <<=1) { - if (newrank < 0) break; - /* Determine remote node */ - newremote = newrank ^ distance; - remote = (newremote < extra_ranks)? - (newremote * 2 + 1):(newremote + extra_ranks); + for (distance = 0x1; distance < adjsize; distance <<=1) { + if (newrank < 0) break; + /* Determine remote node */ + newremote = newrank ^ distance; + remote = (newremote < extra_ranks)? + (newremote * 2 + 1):(newremote + extra_ranks); - /* Exchange the data */ - ret = MCA_PML_CALL(irecv(tmprecv, count, dtype, remote, - MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[0])); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote, - MCA_COLL_BASE_TAG_ALLREDUCE, - MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1])); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_request_wait_all(2, reqs, MPI_STATUSES_IGNORE); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - - /* Apply operation */ - if (rank < remote) { - /* tmprecv = tmpsend (op) tmprecv */ - ompi_op_reduce(op, tmpsend, tmprecv, count, dtype); - tmpswap = tmprecv; - tmprecv = tmpsend; - tmpsend = tmpswap; - } else { - /* tmpsend = tmprecv (op) tmpsend */ - ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); - } - } - - /* Handle non-power-of-two case: - - Odd ranks less than 2 * extra_ranks send result from tmpsend to - (rank - 1) - - Even ranks less than 2 * extra_ranks receive result from (rank + 1) - */ - if (rank < (2 * extra_ranks)) { - if (0 == (rank % 2)) { - ret = MCA_PML_CALL(recv(rbuf, count, dtype, (rank + 1), - MCA_COLL_BASE_TAG_ALLREDUCE, comm, - MPI_STATUS_IGNORE)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - tmpsend = (char*)rbuf; - } else { - ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank - 1), + /* Exchange the data */ + ret = MCA_PML_CALL(irecv(tmprecv, count, dtype, remote, + MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[0])); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + ret = MCA_PML_CALL(isend(tmpsend, count, dtype, remote, MCA_COLL_BASE_TAG_ALLREDUCE, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - } - } + MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1])); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + ret = ompi_request_wait_all(2, reqs, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Ensure that the final result is in rbuf */ - if (tmpsend != rbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, tmpsend); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } + /* Apply operation */ + if (rank < remote) { + /* tmprecv = tmpsend (op) tmprecv */ + ompi_op_reduce(op, tmpsend, tmprecv, count, dtype); + tmpswap = tmprecv; + tmprecv = tmpsend; + tmpsend = tmpswap; + } else { + /* tmpsend = tmprecv (op) tmpsend */ + ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); + } + } - if (NULL != inplacebuf) free(inplacebuf); - return MPI_SUCCESS; + /* Handle non-power-of-two case: + - Odd ranks less than 2 * extra_ranks send result from tmpsend to + (rank - 1) + - Even ranks less than 2 * extra_ranks receive result from (rank + 1) + */ + if 
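
Worth spelling out for the apply step in the loop above: ompi_op_reduce(op, src, dst) computes dst = src op dst, so both peers of one exchange apply the operator as (lower rank's data) op (higher rank's data). That keeps the result identical on both sides even for non-commutative user ops; the pointer swap on the lower rank only moves the result back into tmpsend. An annotated re-reading of that branch:

    if (rank < remote) {
        /* I am the lower rank: tmprecv = mine op theirs */
        ompi_op_reduce(op, tmpsend, tmprecv, count, dtype);
        tmpswap = tmprecv; tmprecv = tmpsend; tmpsend = tmpswap; /* result -> tmpsend */
    } else {
        /* I am the higher rank: tmpsend = theirs op mine */
        ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
    }
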
(rank < (2 * extra_ranks)) { + if (0 == (rank % 2)) { + ret = MCA_PML_CALL(recv(rbuf, count, dtype, (rank + 1), + MCA_COLL_BASE_TAG_ALLREDUCE, comm, + MPI_STATUS_IGNORE)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + tmpsend = (char*)rbuf; + } else { + ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank - 1), + MCA_COLL_BASE_TAG_ALLREDUCE, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + } + } + + /* Ensure that the final result is in rbuf */ + if (tmpsend != rbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, tmpsend); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } + + if (NULL != inplacebuf) free(inplacebuf); + return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", - __FILE__, line, rank, ret)); - if (NULL != inplacebuf) free(inplacebuf); - return ret; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", + __FILE__, line, rank, ret)); + if (NULL != inplacebuf) free(inplacebuf); + return ret; } /* @@ -343,198 +341,195 @@ ompi_coll_tuned_allreduce_intra_ring(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int ret, line; - int rank, size, k, recv_from, send_to; - int early_segcount, late_segcount, split_rank, max_segcount; - int block_count, inbi; - size_t typelng; - char *tmpsend = NULL, *tmprecv = NULL; - char *inbuf[2] = {NULL, NULL}; - ptrdiff_t true_lb, true_extent, lb, extent; - ptrdiff_t block_offset, max_real_segsize; - ompi_request_t *reqs[2] = {NULL, NULL}; + int ret, line, rank, size, k, recv_from, send_to, block_count, inbi; + int early_segcount, late_segcount, split_rank, max_segcount; + size_t typelng; + char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; + ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t block_offset, max_real_segsize; + ompi_request_t *reqs[2] = {NULL, NULL}; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allreduce_intra_ring rank %d, count %d", rank, count)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allreduce_intra_ring rank %d, count %d", rank, count)); - /* Special case for size == 1 */ - if (1 == size) { - if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } - return MPI_SUCCESS; - } + /* Special case for size == 1 */ + if (1 == size) { + if (MPI_IN_PLACE != sbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } + return MPI_SUCCESS; + } - /* Special case for count less than size - use recursive doubling */ - if (count < size) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count)); - return (ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf, - count, - dtype, op, - comm, module)); - } + /* Special case for count less than size - use recursive doubling */ + if (count < size) { + OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring rank %d/%d, count %d, switching to recursive doubling", rank, size, count)); + return 
(ompi_coll_tuned_allreduce_intra_recursivedoubling(sbuf, rbuf, + count, + dtype, op, + comm, module)); + } - /* Allocate and initialize temporary buffers */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_type_size( dtype, &typelng); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Allocate and initialize temporary buffers */ + ret = ompi_datatype_get_extent(dtype, &lb, &extent); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + ret = ompi_datatype_type_size( dtype, &typelng); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Determine the number of elements per block and corresponding - block sizes. - The blocks are divided into "early" and "late" ones: - blocks 0 .. (split_rank - 1) are "early" and - blocks (split_rank) .. (size - 1) are "late". - Early blocks are at most 1 element larger than the late ones. + /* Determine the number of elements per block and corresponding + block sizes. + The blocks are divided into "early" and "late" ones: + blocks 0 .. (split_rank - 1) are "early" and + blocks (split_rank) .. (size - 1) are "late". + Early blocks are at most 1 element larger than the late ones. */ - COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, - early_segcount, late_segcount ) - max_segcount = early_segcount; - max_real_segsize = true_extent + (max_segcount - 1) * extent; + COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, + early_segcount, late_segcount ) + max_segcount = early_segcount; + max_real_segsize = true_extent + (max_segcount - 1) * extent; - inbuf[0] = (char*)malloc(max_real_segsize); - if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } - if (size > 2) { - inbuf[1] = (char*)malloc(max_real_segsize); - if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } - } + inbuf[0] = (char*)malloc(max_real_segsize); + if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } + if (size > 2) { + inbuf[1] = (char*)malloc(max_real_segsize); + if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } + } - /* Handle MPI_IN_PLACE */ - if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } + /* Handle MPI_IN_PLACE */ + if (MPI_IN_PLACE != sbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } - /* Computation loop */ + /* Computation loop */ - /* - For each of the remote nodes: - - post irecv for block (r-1) - - send block (r) - - in loop for every step k = 2 .. n - - post irecv for block (r + n - k) % n - - wait on block (r + n - k + 1) % n to arrive - - compute on block (r + n - k + 1) % n - - send block (r + n - k + 1) % n - - wait on block (r + 1) - - compute on block (r + 1) - - send block (r + 1) to rank (r + 1) - Note that we must be careful when computing the begining of buffers and - for send operations and computation we must compute the exact block size. + /* + For each of the remote nodes: + - post irecv for block (r-1) + - send block (r) + - in loop for every step k = 2 .. 
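
COLL_TUNED_COMPUTE_BLOCKCOUNT is used here without its body being shown; below is a plain-C reconstruction consistent with its uses in this file (an assumption, not the macro verbatim), together with the block offset rule the ring code repeats:

    #include <stddef.h>

    /* The first count % num_blocks "early" blocks carry one extra element. */
    static void compute_blockcount(int count, int num_blocks, int *split_index,
                                   int *early_count, int *late_count)
    {
        *late_count  = count / num_blocks;
        *split_index = count % num_blocks;
        *early_count = (0 != *split_index) ? *late_count + 1 : *late_count;
    }

    /* Element offset of block b; e.g. count = 10, size = 4 gives blocks
       of 3,3,2,2 at offsets 0,3,6,8. */
    static ptrdiff_t block_offset_of(int b, int split, int early, int late)
    {
        return (b < split) ? (ptrdiff_t)b * early
                           : (ptrdiff_t)b * late + split;
    }
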
n + - post irecv for block (r + n - k) % n + - wait on block (r + n - k + 1) % n to arrive + - compute on block (r + n - k + 1) % n + - send block (r + n - k + 1) % n + - wait on block (r + 1) + - compute on block (r + 1) + - send block (r + 1) to rank (r + 1) + Note that we must be careful when computing the begining of buffers and + for send operations and computation we must compute the exact block size. */ - send_to = (rank + 1) % size; - recv_from = (rank + size - 1) % size; + send_to = (rank + 1) % size; + recv_from = (rank + size - 1) % size; - inbi = 0; - /* Initialize first receive from the neighbor on the left */ - ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, - MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Send first block (my block) to the neighbor on the right */ - block_offset = ((rank < split_rank)? - ((ptrdiff_t)rank * (ptrdiff_t)early_segcount) : - ((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank)); - block_count = ((rank < split_rank)? early_segcount : late_segcount); - tmpsend = ((char*)rbuf) + block_offset * extent; - ret = MCA_PML_CALL(send(tmpsend, block_count, dtype, send_to, - MCA_COLL_BASE_TAG_ALLREDUCE, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + inbi = 0; + /* Initialize first receive from the neighbor on the left */ + ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, + MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Send first block (my block) to the neighbor on the right */ + block_offset = ((rank < split_rank)? + ((ptrdiff_t)rank * (ptrdiff_t)early_segcount) : + ((ptrdiff_t)rank * (ptrdiff_t)late_segcount + split_rank)); + block_count = ((rank < split_rank)? early_segcount : late_segcount); + tmpsend = ((char*)rbuf) + block_offset * extent; + ret = MCA_PML_CALL(send(tmpsend, block_count, dtype, send_to, + MCA_COLL_BASE_TAG_ALLREDUCE, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - for (k = 2; k < size; k++) { - const int prevblock = (rank + size - k + 1) % size; + for (k = 2; k < size; k++) { + const int prevblock = (rank + size - k + 1) % size; - inbi = inbi ^ 0x1; + inbi = inbi ^ 0x1; - /* Post irecv for the current block */ - ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, - MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Post irecv for the current block */ + ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, + MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Wait on previous block to arrive */ - ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Wait on previous block to arrive */ + ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Apply operation on previous block: result goes to rbuf - rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] - */ - block_offset = ((prevblock < split_rank)? - ((ptrdiff_t)prevblock * early_segcount) : - ((ptrdiff_t)prevblock * late_segcount + split_rank)); - block_count = ((prevblock < split_rank)? 
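
The reduce-scatter loop above is a two-buffer pipeline: at any moment at most one receive is outstanding while the previously received segment is reduced into rbuf and forwarded, so inbuf[2] plus the inbi ^ 0x1 flip is all the buffering it needs. The control flow, reduced to a skeleton (post_irecv, wait_seg and the other helpers are hypothetical stand-ins for the PML calls above):

    inbi = 0;
    post_irecv(inbuf[inbi]);             /* prime the pipeline          */
    send_own_block();
    for (k = 2; k < size; k++) {
        inbi ^= 0x1;
        post_irecv(inbuf[inbi]);         /* next segment, other buffer  */
        wait_seg(inbi ^ 0x1);            /* previous segment has landed */
        reduce_into_rbuf(inbuf[inbi ^ 0x1]);
        send_reduced_block();            /* forward around the ring     */
    }
    wait_seg(inbi);                      /* last segment */
    reduce_into_rbuf(inbuf[inbi]);
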
early_segcount : late_segcount); - tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); + /* Apply operation on previous block: result goes to rbuf + rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] + */ + block_offset = ((prevblock < split_rank)? + ((ptrdiff_t)prevblock * early_segcount) : + ((ptrdiff_t)prevblock * late_segcount + split_rank)); + block_count = ((prevblock < split_rank)? early_segcount : late_segcount); + tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; + ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype); - /* send previous block to send_to */ - ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to, - MCA_COLL_BASE_TAG_ALLREDUCE, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - } + /* send previous block to send_to */ + ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to, + MCA_COLL_BASE_TAG_ALLREDUCE, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + } - /* Wait on the last block to arrive */ - ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Wait on the last block to arrive */ + ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Apply operation on the last block (from neighbor (rank + 1) - rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */ - recv_from = (rank + 1) % size; - block_offset = ((recv_from < split_rank)? - ((ptrdiff_t)recv_from * early_segcount) : - ((ptrdiff_t)recv_from * late_segcount + split_rank)); - block_count = ((recv_from < split_rank)? early_segcount : late_segcount); - tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; - ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype); + /* Apply operation on the last block (from neighbor (rank + 1) + rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */ + recv_from = (rank + 1) % size; + block_offset = ((recv_from < split_rank)? + ((ptrdiff_t)recv_from * early_segcount) : + ((ptrdiff_t)recv_from * late_segcount + split_rank)); + block_count = ((recv_from < split_rank)? early_segcount : late_segcount); + tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent; + ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype); - /* Distribution loop - variation of ring allgather */ - send_to = (rank + 1) % size; - recv_from = (rank + size - 1) % size; - for (k = 0; k < size - 1; k++) { - const int recv_data_from = (rank + size - k) % size; - const int send_data_from = (rank + 1 + size - k) % size; - const int send_block_offset = - ((send_data_from < split_rank)? - ((ptrdiff_t)send_data_from * early_segcount) : - ((ptrdiff_t)send_data_from * late_segcount + split_rank)); - const int recv_block_offset = - ((recv_data_from < split_rank)? - ((ptrdiff_t)recv_data_from * early_segcount) : - ((ptrdiff_t)recv_data_from * late_segcount + split_rank)); - block_count = ((send_data_from < split_rank)? - early_segcount : late_segcount); + /* Distribution loop - variation of ring allgather */ + send_to = (rank + 1) % size; + recv_from = (rank + size - 1) % size; + for (k = 0; k < size - 1; k++) { + const int recv_data_from = (rank + size - k) % size; + const int send_data_from = (rank + 1 + size - k) % size; + const int send_block_offset = + ((send_data_from < split_rank)? 
+ ((ptrdiff_t)send_data_from * early_segcount) : + ((ptrdiff_t)send_data_from * late_segcount + split_rank)); + const int recv_block_offset = + ((recv_data_from < split_rank)? + ((ptrdiff_t)recv_data_from * early_segcount) : + ((ptrdiff_t)recv_data_from * late_segcount + split_rank)); + block_count = ((send_data_from < split_rank)? + early_segcount : late_segcount); - tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent; - tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent; + tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent; + tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent; - ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to, - MCA_COLL_BASE_TAG_ALLREDUCE, - tmprecv, max_segcount, dtype, recv_from, - MCA_COLL_BASE_TAG_ALLREDUCE, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl;} + ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to, + MCA_COLL_BASE_TAG_ALLREDUCE, + tmprecv, max_segcount, dtype, recv_from, + MCA_COLL_BASE_TAG_ALLREDUCE, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl;} - } + } - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + if (NULL != inbuf[0]) free(inbuf[0]); + if (NULL != inbuf[1]) free(inbuf[1]); - return MPI_SUCCESS; + return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", - __FILE__, line, rank, ret)); - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); - return ret; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", + __FILE__, line, rank, ret)); + if (NULL != inbuf[0]) free(inbuf[0]); + if (NULL != inbuf[1]) free(inbuf[1]); + return ret; } /* @@ -621,243 +616,239 @@ ompi_coll_tuned_allreduce_intra_ring_segmented(void *sbuf, void *rbuf, int count struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, uint32_t segsize) { - int ret, line; - int rank, size, k, recv_from, send_to; - int early_blockcount, late_blockcount, split_rank; - int segcount, max_segcount; - int num_phases, phase; - int block_count, inbi; - size_t typelng; - char *tmpsend = NULL, *tmprecv = NULL; - char *inbuf[2] = {NULL, NULL}; - ptrdiff_t true_lb, true_extent, lb, extent; - ptrdiff_t block_offset, max_real_segsize; - ompi_request_t *reqs[2] = {NULL, NULL}; + int ret, line, rank, size, k, recv_from, send_to; + int early_blockcount, late_blockcount, split_rank; + int segcount, max_segcount, num_phases, phase, block_count, inbi; + size_t typelng; + char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; + ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t block_offset, max_real_segsize; + ompi_request_t *reqs[2] = {NULL, NULL}; - size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count)); - /* Special case for size == 1 */ - if (1 == size) { - if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } - return MPI_SUCCESS; - } + /* Special case for size == 1 */ + if (1 == 
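
In the distribution loop above, each rank forwards at step k the block it received at step k - 1, so every fully reduced block travels once around the ring. A runnable toy that prints the schedule (indices only, no MPI):

    #include <stdio.h>

    int main(void)
    {
        const int size = 4;
        for (int rank = 0; rank < size; rank++) {
            printf("rank %d:", rank);
            for (int k = 0; k < size - 1; k++) {
                int send_block = (rank + 1 + size - k) % size;
                int recv_block = (rank + size - k) % size;
                printf("  k=%d send %d recv %d", k, send_block, recv_block);
            }
            printf("\n");
        }
        return 0;
    }

At k = 0 rank r sends block (r + 1) % size, the one it finished reducing last, and receives block r, the one its left neighbor finished last.
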
size) { + if (MPI_IN_PLACE != sbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } + return MPI_SUCCESS; + } - /* Determine segment count based on the suggested segment size */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_type_size( dtype, &typelng); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - segcount = count; - COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount) + /* Determine segment count based on the suggested segment size */ + ret = ompi_datatype_get_extent(dtype, &lb, &extent); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + ret = ompi_datatype_type_size( dtype, &typelng); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + segcount = count; + COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount) - /* Special case for count less than size * segcount - use regular ring */ - if (count < (size * segcount)) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count)); - return (ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, - comm, module)); - } + /* Special case for count less than size * segcount - use regular ring */ + if (count < (size * segcount)) { + OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count)); + return (ompi_coll_tuned_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, + comm, module)); + } - /* Determine the number of phases of the algorithm */ - num_phases = count / (size * segcount); - if ((count % (size * segcount) >= size) && - (count % (size * segcount) > ((size * segcount) / 2))) { - num_phases++; - } + /* Determine the number of phases of the algorithm */ + num_phases = count / (size * segcount); + if ((count % (size * segcount) >= size) && + (count % (size * segcount) > ((size * segcount) / 2))) { + num_phases++; + } - /* Determine the number of elements per block and corresponding - block sizes. - The blocks are divided into "early" and "late" ones: - blocks 0 .. (split_rank - 1) are "early" and - blocks (split_rank) .. (size - 1) are "late". - Early blocks are at most 1 element larger than the late ones. - Note, these blocks will be split into num_phases segments, - out of the largest one will have max_segcount elements. + /* Determine the number of elements per block and corresponding + block sizes. + The blocks are divided into "early" and "late" ones: + blocks 0 .. (split_rank - 1) are "early" and + blocks (split_rank) .. (size - 1) are "late". + Early blocks are at most 1 element larger than the late ones. + Note, these blocks will be split into num_phases segments, + out of the largest one will have max_segcount elements. 
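
The phase count chosen above adds one extra phase only when the tail of the division is worth pipelining: at least one element per rank and more than half a full chunk. Isolated as a function for clarity:

    /* num_phases as computed above; a chunk is size * segcount elements. */
    static int ring_num_phases(int count, int size, int segcount)
    {
        int chunk = size * segcount;
        int num_phases = count / chunk;
        int tail = count % chunk;
        if (tail >= size && tail > chunk / 2)
            num_phases++;
        return num_phases;
    }
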
*/ - COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, - early_blockcount, late_blockcount ) - COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, - max_segcount, k) - max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent; + COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, + early_blockcount, late_blockcount ) + COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, + max_segcount, k) + max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent; - /* Allocate and initialize temporary buffers */ - inbuf[0] = (char*)malloc(max_real_segsize); - if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } - if (size > 2) { - inbuf[1] = (char*)malloc(max_real_segsize); - if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } - } + /* Allocate and initialize temporary buffers */ + inbuf[0] = (char*)malloc(max_real_segsize); + if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; } + if (size > 2) { + inbuf[1] = (char*)malloc(max_real_segsize); + if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; } + } - /* Handle MPI_IN_PLACE */ - if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } + /* Handle MPI_IN_PLACE */ + if (MPI_IN_PLACE != sbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } - /* Computation loop: for each phase, repeat ring allreduce computation loop */ - for (phase = 0; phase < num_phases; phase ++) { - ptrdiff_t phase_offset; - int early_phase_segcount, late_phase_segcount, split_phase, phase_count; + /* Computation loop: for each phase, repeat ring allreduce computation loop */ + for (phase = 0; phase < num_phases; phase ++) { + ptrdiff_t phase_offset; + int early_phase_segcount, late_phase_segcount, split_phase, phase_count; - /* - For each of the remote nodes: - - post irecv for block (r-1) - - send block (r) + /* + For each of the remote nodes: + - post irecv for block (r-1) + - send block (r) To do this, first compute block offset and count, and use block offset to compute phase offset. - - in loop for every step k = 2 .. n + - in loop for every step k = 2 .. n - post irecv for block (r + n - k) % n - wait on block (r + n - k + 1) % n to arrive - compute on block (r + n - k + 1) % n - send block (r + n - k + 1) % n - - wait on block (r + 1) - - compute on block (r + 1) - - send block (r + 1) to rank (r + 1) - Note that we must be careful when computing the begining of buffers and - for send operations and computation we must compute the exact block size. - */ - send_to = (rank + 1) % size; - recv_from = (rank + size - 1) % size; + - wait on block (r + 1) + - compute on block (r + 1) + - send block (r + 1) to rank (r + 1) + Note that we must be careful when computing the begining of buffers and + for send operations and computation we must compute the exact block size. 
+ */ + send_to = (rank + 1) % size; + recv_from = (rank + size - 1) % size; - inbi = 0; - /* Initialize first receive from the neighbor on the left */ - ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, - MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Send first block (my block) to the neighbor on the right: - - compute my block and phase offset - - send data */ - block_offset = ((rank < split_rank)? - ((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) : - ((ptrdiff_t)rank * (ptrdiff_t)late_blockcount + split_rank)); - block_count = ((rank < split_rank)? early_blockcount : late_blockcount); - COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, - early_phase_segcount, late_phase_segcount) - phase_count = ((phase < split_phase)? - (early_phase_segcount) : (late_phase_segcount)); - phase_offset = ((phase < split_phase)? - ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : - ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); - tmpsend = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; - ret = MCA_PML_CALL(send(tmpsend, phase_count, dtype, send_to, - MCA_COLL_BASE_TAG_ALLREDUCE, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + inbi = 0; + /* Initialize first receive from the neighbor on the left */ + ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, + MCA_COLL_BASE_TAG_ALLREDUCE, comm, &reqs[inbi])); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Send first block (my block) to the neighbor on the right: + - compute my block and phase offset + - send data */ + block_offset = ((rank < split_rank)? + ((ptrdiff_t)rank * (ptrdiff_t)early_blockcount) : + ((ptrdiff_t)rank * (ptrdiff_t)late_blockcount + split_rank)); + block_count = ((rank < split_rank)? early_blockcount : late_blockcount); + COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, + early_phase_segcount, late_phase_segcount) + phase_count = ((phase < split_phase)? + (early_phase_segcount) : (late_phase_segcount)); + phase_offset = ((phase < split_phase)? 
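
Each block is then re-split across phases with the same early/late rule used for blocks, and the phase offset computed here is relative to the owning block's start. A sketch using compute_blockcount() from the earlier sketch:

    /* Element count and offset (within the block) of one phase's slice. */
    static void phase_slice(int block_count, int num_phases, int phase,
                            int *phase_count, ptrdiff_t *phase_offset)
    {
        int split_phase, early, late;
        compute_blockcount(block_count, num_phases, &split_phase, &early, &late);
        *phase_count  = (phase < split_phase) ? early : late;
        *phase_offset = (phase < split_phase)
            ? (ptrdiff_t)phase * early
            : (ptrdiff_t)phase * late + split_phase;
    }
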
+ ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : + ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); + tmpsend = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; + ret = MCA_PML_CALL(send(tmpsend, phase_count, dtype, send_to, + MCA_COLL_BASE_TAG_ALLREDUCE, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - for (k = 2; k < size; k++) { - const int prevblock = (rank + size - k + 1) % size; + for (k = 2; k < size; k++) { + const int prevblock = (rank + size - k + 1) % size; - inbi = inbi ^ 0x1; + inbi = inbi ^ 0x1; - /* Post irecv for the current block */ - ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, - MCA_COLL_BASE_TAG_ALLREDUCE, comm, - &reqs[inbi])); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Post irecv for the current block */ + ret = MCA_PML_CALL(irecv(inbuf[inbi], max_segcount, dtype, recv_from, + MCA_COLL_BASE_TAG_ALLREDUCE, comm, + &reqs[inbi])); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Wait on previous block to arrive */ - ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Wait on previous block to arrive */ + ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Apply operation on previous block: result goes to rbuf - rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] - */ - block_offset = ((prevblock < split_rank)? - ((ptrdiff_t)prevblock * (ptrdiff_t)early_blockcount) : - ((ptrdiff_t)prevblock * (ptrdiff_t)late_blockcount + split_rank)); - block_count = ((prevblock < split_rank)? - early_blockcount : late_blockcount); - COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, - early_phase_segcount, late_phase_segcount) - phase_count = ((phase < split_phase)? - (early_phase_segcount) : (late_phase_segcount)); - phase_offset = ((phase < split_phase)? - ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : - ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); - tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype); + /* Apply operation on previous block: result goes to rbuf + rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] + */ + block_offset = ((prevblock < split_rank)? + ((ptrdiff_t)prevblock * (ptrdiff_t)early_blockcount) : + ((ptrdiff_t)prevblock * (ptrdiff_t)late_blockcount + split_rank)); + block_count = ((prevblock < split_rank)? + early_blockcount : late_blockcount); + COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, + early_phase_segcount, late_phase_segcount) + phase_count = ((phase < split_phase)? + (early_phase_segcount) : (late_phase_segcount)); + phase_offset = ((phase < split_phase)? 
+ ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : + ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); + tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; + ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype); - /* send previous block to send_to */ - ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to, - MCA_COLL_BASE_TAG_ALLREDUCE, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - } + /* send previous block to send_to */ + ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to, + MCA_COLL_BASE_TAG_ALLREDUCE, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + } - /* Wait on the last block to arrive */ - ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Wait on the last block to arrive */ + ret = ompi_request_wait(&reqs[inbi], MPI_STATUS_IGNORE); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Apply operation on the last block (from neighbor (rank + 1) - rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */ - recv_from = (rank + 1) % size; - block_offset = ((recv_from < split_rank)? - ((ptrdiff_t)recv_from * (ptrdiff_t)early_blockcount) : - ((ptrdiff_t)recv_from * (ptrdiff_t)late_blockcount + split_rank)); - block_count = ((recv_from < split_rank)? - early_blockcount : late_blockcount); - COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, - early_phase_segcount, late_phase_segcount) - phase_count = ((phase < split_phase)? - (early_phase_segcount) : (late_phase_segcount)); - phase_offset = ((phase < split_phase)? - ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : - ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); - tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; - ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype); - } + /* Apply operation on the last block (from neighbor (rank + 1) + rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */ + recv_from = (rank + 1) % size; + block_offset = ((recv_from < split_rank)? + ((ptrdiff_t)recv_from * (ptrdiff_t)early_blockcount) : + ((ptrdiff_t)recv_from * (ptrdiff_t)late_blockcount + split_rank)); + block_count = ((recv_from < split_rank)? + early_blockcount : late_blockcount); + COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, + early_phase_segcount, late_phase_segcount) + phase_count = ((phase < split_phase)? + (early_phase_segcount) : (late_phase_segcount)); + phase_offset = ((phase < split_phase)? + ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : + ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); + tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent; + ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype); + } - /* Distribution loop - variation of ring allgather */ - send_to = (rank + 1) % size; - recv_from = (rank + size - 1) % size; - for (k = 0; k < size - 1; k++) { - const int recv_data_from = (rank + size - k) % size; - const int send_data_from = (rank + 1 + size - k) % size; - const int send_block_offset = - ((send_data_from < split_rank)? - ((ptrdiff_t)send_data_from * (ptrdiff_t)early_blockcount) : - ((ptrdiff_t)send_data_from * (ptrdiff_t)late_blockcount + split_rank)); - const int recv_block_offset = - ((recv_data_from < split_rank)? 
- ((ptrdiff_t)recv_data_from * (ptrdiff_t)early_blockcount) : - ((ptrdiff_t)recv_data_from * (ptrdiff_t)late_blockcount + split_rank)); - block_count = ((send_data_from < split_rank)? - early_blockcount : late_blockcount); + /* Distribution loop - variation of ring allgather */ + send_to = (rank + 1) % size; + recv_from = (rank + size - 1) % size; + for (k = 0; k < size - 1; k++) { + const int recv_data_from = (rank + size - k) % size; + const int send_data_from = (rank + 1 + size - k) % size; + const int send_block_offset = + ((send_data_from < split_rank)? + ((ptrdiff_t)send_data_from * (ptrdiff_t)early_blockcount) : + ((ptrdiff_t)send_data_from * (ptrdiff_t)late_blockcount + split_rank)); + const int recv_block_offset = + ((recv_data_from < split_rank)? + ((ptrdiff_t)recv_data_from * (ptrdiff_t)early_blockcount) : + ((ptrdiff_t)recv_data_from * (ptrdiff_t)late_blockcount + split_rank)); + block_count = ((send_data_from < split_rank)? + early_blockcount : late_blockcount); - tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent; - tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent; + tmprecv = (char*)rbuf + (ptrdiff_t)recv_block_offset * extent; + tmpsend = (char*)rbuf + (ptrdiff_t)send_block_offset * extent; - ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to, - MCA_COLL_BASE_TAG_ALLREDUCE, - tmprecv, early_blockcount, dtype, recv_from, - MCA_COLL_BASE_TAG_ALLREDUCE, - comm, MPI_STATUS_IGNORE, rank); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl;} + ret = ompi_coll_tuned_sendrecv(tmpsend, block_count, dtype, send_to, + MCA_COLL_BASE_TAG_ALLREDUCE, + tmprecv, early_blockcount, dtype, recv_from, + MCA_COLL_BASE_TAG_ALLREDUCE, + comm, MPI_STATUS_IGNORE, rank); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl;} - } + } - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); + if (NULL != inbuf[0]) free(inbuf[0]); + if (NULL != inbuf[1]) free(inbuf[1]); - return MPI_SUCCESS; + return MPI_SUCCESS; error_hndl: - OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", - __FILE__, line, rank, ret)); - if (NULL != inbuf[0]) free(inbuf[0]); - if (NULL != inbuf[1]) free(inbuf[1]); - return ret; + OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", + __FILE__, line, rank, ret)); + if (NULL != inbuf[0]) free(inbuf[0]); + if (NULL != inbuf[1]) free(inbuf[1]); + return ret; } /* @@ -887,10 +878,9 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int err; - int rank; + int err, rank; rank = ompi_comm_rank(comm); @@ -901,14 +891,14 @@ ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count, if (MPI_IN_PLACE == sbuf) { if (0 == rank) { err = ompi_coll_tuned_reduce_intra_basic_linear (MPI_IN_PLACE, rbuf, count, dtype, - op, 0, comm, module); + op, 0, comm, module); } else { err = ompi_coll_tuned_reduce_intra_basic_linear(rbuf, NULL, count, dtype, - op, 0, comm, module); + op, 0, comm, module); } } else { err = ompi_coll_tuned_reduce_intra_basic_linear(sbuf, rbuf, count, dtype, - op, 0, comm, module); + op, 0, comm, module); } if (MPI_SUCCESS != err) { return err; @@ -986,7 +976,7 @@ int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - 
mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; @@ -1016,7 +1006,7 @@ int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d", diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall.c b/ompi/mca/coll/tuned/coll_tuned_alltoall.c index ef1baead79..974ca2371e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall.c @@ -34,11 +34,9 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1, err = 0; - int rank, size, step; - int sendto, recvfrom; + int line = -1, err = 0, rank, size, step, sendto, recvfrom; void * tmpsend, *tmprecv; ptrdiff_t lb, sext, rext; @@ -89,15 +87,12 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int i, k, line = -1; - int rank, size; + int i, k, line = -1, rank, size, err = 0, weallocated = 0; int sendto, recvfrom, distance, *displs = NULL, *blen = NULL; char *tmpbuf = NULL, *tmpbuf_free = NULL; ptrdiff_t rlb, slb, tlb, sext, rext, tsext; - int err = 0; - int weallocated = 0; struct ompi_datatype_t *new_ddt; #ifdef blahblah mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; @@ -147,17 +142,17 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, /* Step 1 - local rotation - shift up by rank */ err = ompi_datatype_copy_content_same_ddt (sdtype, - (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount), - tmpbuf, - ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext); + (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount), + tmpbuf, + ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } if (rank != 0) { err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount, - tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, - (char*) sbuf); + tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, + (char*) sbuf); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } @@ -206,8 +201,8 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, for (i = 0; i < size; i++) { err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount, - ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), - tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext); + ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), + tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } } @@ -253,18 +248,12 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, int 
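
Bruck's alltoall brackets its log-step communication phases with two local rotations: step 1 shifts the send blocks up by rank, step 3 shifts the received blocks back down. A runnable toy of just those rotations, one int per block (the communication phases that belong between them are elided here, so the two rotations deliberately do not cancel out):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        enum { size = 5 };
        const int rank = 2;
        int sbuf[size], tmpbuf[size], rbuf[size];

        for (int i = 0; i < size; i++) sbuf[i] = i;  /* block destined to peer i */

        /* Step 1: tmpbuf[i] = sbuf[(rank + i) % size], done as two copies
           (the real code uses ompi_datatype_copy_content_same_ddt). */
        memcpy(tmpbuf, sbuf + rank, (size - rank) * sizeof(int));
        memcpy(tmpbuf + (size - rank), sbuf, rank * sizeof(int));

        /* ... the communication phases would operate on tmpbuf here ... */

        /* Step 3: inverse rotation into the receive buffer. */
        for (int i = 0; i < size; i++)
            rbuf[(rank - i + size) % size] = tmpbuf[i];

        for (int i = 0; i < size; i++) printf("%d ", rbuf[i]);
        printf("\n");
        return 0;
    }
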
max_outstanding_reqs) { - int line, error; - int ri, si; - int rank; - int size; - int nreqs, nrreqs, nsreqs, total_reqs; - char *psnd; - char *prcv; - ptrdiff_t slb, sext; - ptrdiff_t rlb, rext; + int line, error, ri, si, rank, size, nreqs, nrreqs, nsreqs, total_reqs; + char *psnd, *prcv; + ptrdiff_t slb, sext, rlb, rext; ompi_request_t **reqs = NULL; @@ -318,67 +307,67 @@ int ompi_coll_tuned_alltoall_intra_linear_sync(void *sbuf, int scount, /* Post first batch or ireceive and isend requests */ for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs; ri = (ri + 1) % size, ++nreqs, ++nrreqs) { - error = - MCA_PML_CALL(irecv - (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, - MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs])); - if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } + error = + MCA_PML_CALL(irecv + (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, + MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs])); + if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs; si = (si + size - 1) % size, ++nreqs, ++nsreqs) { - error = - MCA_PML_CALL(isend - (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, - MCA_COLL_BASE_TAG_ALLTOALL, - MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs])); - if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } + error = + MCA_PML_CALL(isend + (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, + MCA_COLL_BASE_TAG_ALLTOALL, + MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs])); + if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } /* Wait for requests to complete */ if (nreqs == 2 * (size - 1)) { - /* Optimization for the case when all requests have been posted */ - error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); - if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } + /* Optimization for the case when all requests have been posted */ + error = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } } else { - /* As requests complete, replace them with corresponding requests: - - wait for any request to complete, mark the request as - MPI_REQUEST_NULL - - If it was a receive request, replace it with new irecv request - (if any) - - if it was a send request, replace it with new isend request (if any) - */ - int ncreqs = 0; - while (ncreqs < 2 * (size - 1)) { - int completed; - error = ompi_request_wait_any(2 * total_reqs, reqs, &completed, - MPI_STATUS_IGNORE); - if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } - reqs[completed] = MPI_REQUEST_NULL; - ncreqs++; - if (completed < total_reqs) { - if (nrreqs < (size - 1)) { - error = - MCA_PML_CALL(irecv - (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, - MCA_COLL_BASE_TAG_ALLTOALL, comm, - &reqs[completed])); - if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } - ++nrreqs; - ri = (ri + 1) % size; - } - } else { - if (nsreqs < (size - 1)) { - error = MCA_PML_CALL(isend - (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, - MCA_COLL_BASE_TAG_ALLTOALL, - MCA_PML_BASE_SEND_STANDARD, comm, + /* As requests complete, replace them with corresponding requests: + - wait for any request to complete, mark the request as + MPI_REQUEST_NULL + - If it was a receive request, replace it with new irecv request + (if any) + - if it was a send request, replace it with new isend request (if any) + */ + int ncreqs = 0; + while (ncreqs < 2 * (size - 1)) { + int completed; + error = 
ompi_request_wait_any(2 * total_reqs, reqs, &completed, + MPI_STATUS_IGNORE); + if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } + reqs[completed] = MPI_REQUEST_NULL; + ncreqs++; + if (completed < total_reqs) { + if (nrreqs < (size - 1)) { + error = + MCA_PML_CALL(irecv + (prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri, + MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[completed])); - ++nsreqs; - si = (si + size - 1) % size; - } - } - } + if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; } + ++nrreqs; + ri = (ri + 1) % size; + } + } else { + if (nsreqs < (size - 1)) { + error = MCA_PML_CALL(isend + (psnd + (ptrdiff_t)si * sext, scount, sdtype, si, + MCA_COLL_BASE_TAG_ALLTOALL, + MCA_PML_BASE_SEND_STANDARD, comm, + &reqs[completed])); + ++nsreqs; + si = (si + size - 1) % size; + } + } + } } /* Free the reqs */ @@ -401,11 +390,9 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int line = -1, err = 0; - int rank; - int remote; + int line = -1, err = 0, rank, remote; void * tmpsend, *tmprecv; ptrdiff_t sext, rext, lb; @@ -436,9 +423,9 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount, /* ddt sendrecv your own data */ err = ompi_datatype_sndrcv((char*) sbuf + (ptrdiff_t)rank * sext * (ptrdiff_t)scount, - (int32_t) scount, sdtype, - (char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount, - (int32_t) rcount, rdtype); + (int32_t) scount, sdtype, + (char*) rbuf + (ptrdiff_t)rank * rext * (ptrdiff_t)rcount, + (int32_t) rcount, rdtype); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* done */ @@ -474,21 +461,10 @@ int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int i; - int rank; - int size; - int err; - int nreqs; - char *psnd; - char *prcv; - MPI_Aint lb; - MPI_Aint sndinc; - MPI_Aint rcvinc; - - ompi_request_t **req; - ompi_request_t **sreq; - ompi_request_t **rreq; - + int i, rank, size, err, nreqs; + char *psnd, *prcv; + MPI_Aint lb, sndinc, rcvinc; + ompi_request_t **req, **sreq, **rreq; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; @@ -653,12 +629,12 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm NULL); mca_param_indices->max_requests_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "alltoall_algorithm_max_requests", - "Maximum number of outstanding send or recv requests. Only has meaning for synchronized algorithms.", - false, false, - ompi_coll_tuned_init_max_requests, /* get system wide default */ - NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "alltoall_algorithm_max_requests", + "Maximum number of outstanding send or recv requests. 
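
The loop above implements a sliding window: max_outstanding_reqs bounds how many sends and receives are in flight, and every completion immediately posts the next request of the same kind into the freed slot. Receives occupy slots [0, total_reqs) and sends [total_reqs, 2 * total_reqs), which is how 'completed < total_reqs' tells them apart. Skeleton form (post_next_irecv/post_next_isend are hypothetical stand-ins for the MCA_PML_CALL lines):

    int ncreqs = 0;
    while (ncreqs < 2 * (size - 1)) {
        int completed;
        ompi_request_wait_any(2 * total_reqs, reqs, &completed, MPI_STATUS_IGNORE);
        reqs[completed] = MPI_REQUEST_NULL;
        ncreqs++;
        if (completed < total_reqs) {        /* a receive slot freed up */
            if (nrreqs < size - 1) post_next_irecv(&reqs[completed]);
        } else {                             /* a send slot freed up */
            if (nsreqs < size - 1) post_next_isend(&reqs[completed]);
        }
    }
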
Only has meaning for synchronized algorithms.", + false, false, + ompi_coll_tuned_init_max_requests, /* get system wide default */ + NULL); if (mca_param_indices->max_requests_param_index < 0) { return mca_param_indices->algorithm_param_index; } @@ -682,7 +658,7 @@ int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; @@ -711,7 +687,7 @@ int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, int algorithm, int faninout, int segsize, int max_requests) { diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoallv.c b/ompi/mca/coll/tuned/coll_tuned_alltoallv.c index 0ab2ff5ff2..8324cab122 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoallv.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoallv.c @@ -38,9 +38,7 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int line = -1, err = 0; - int rank, size, step; - int sendto, recvfrom; + int line = -1, err = 0, rank, size, step, sendto, recvfrom; void *psnd, *prcv; ptrdiff_t sext, rext; @@ -58,7 +56,7 @@ ompi_coll_tuned_alltoallv_intra_pairwise(void *sbuf, int *scounts, int *sdisps, if (0 != scounts[rank]) { err = ompi_datatype_sndrcv(psnd, scounts[rank], sdtype, - prcv, rcounts[rank], rdtype); + prcv, rcounts[rank], rdtype); if (MPI_SUCCESS != err) { return err; } @@ -115,9 +113,8 @@ ompi_coll_tuned_alltoallv_intra_basic_linear(void *sbuf, int *scounts, int *sdis struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int i, size, rank, err; + int i, size, rank, err, nreqs; char *psnd, *prcv; - int nreqs; ptrdiff_t sext, rext; MPI_Request *preq; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; @@ -305,16 +302,16 @@ int ompi_coll_tuned_alltoallv_intra_do_this(void *sbuf, int *scounts, int *sdisp switch (algorithm) { case (0): return ompi_coll_tuned_alltoallv_intra_dec_fixed(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); + rbuf, rcounts, rdisps, rdtype, + comm, module); case (1): return ompi_coll_tuned_alltoallv_intra_basic_linear(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); + rbuf, rcounts, rdisps, rdtype, + comm, module); case (2): return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype, - rbuf, rcounts, rdisps, rdtype, - comm, module); + rbuf, rcounts, rdisps, rdtype, + comm, module); default: OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:alltoall_intra_do_this attempt to select " diff --git a/ompi/mca/coll/tuned/coll_tuned_barrier.c b/ompi/mca/coll/tuned/coll_tuned_barrier.c index a2cf36d477..d660c839f1 100644 --- a/ompi/mca/coll/tuned/coll_tuned_barrier.c +++ b/ompi/mca/coll/tuned/coll_tuned_barrier.c @@ -50,12 +50,9 @@ * */ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int rank, size; - int err=0, line=0; - int left, right; - + int rank, size, err = 0, line = 0, left, right; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); @@ -122,11 +119,9 
@@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm, */ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int rank, size, adjsize; - int err, line; - int mask, remote; + int rank, size, adjsize, err, line, mask, remote; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); @@ -206,11 +201,9 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t * */ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int rank, size; - int distance, to, from; - int err, line = 0; + int rank, size, distance, to, from, err, line = 0; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); @@ -245,7 +238,7 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm, */ /* special case for two processes */ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { int remote, err; @@ -278,14 +271,14 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm, /* copied function (with appropriate renaming) starts here */ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int i, err; - int size = ompi_comm_size(comm); - int rank = ompi_comm_rank(comm); + int i, err, rank, size; + + rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); /* All non-root send & receive zero-length message. */ - if (rank > 0) { err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER, @@ -345,8 +338,7 @@ static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int rank, size, depth; - int err, jump, partner; + int rank, size, depth, err, jump, partner; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); @@ -376,7 +368,7 @@ int ompi_coll_tuned_barrier_intra_tree(struct ompi_communicator_t *comm, } } - depth>>=1; + depth >>= 1; for (jump = depth; jump>0; jump>>=1) { partner = rank ^ jump; if (!(partner & (jump-1)) && partner < size) { @@ -423,10 +415,10 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_ false, true, max_alg, NULL); mca_param_indices->algorithm_param_index = - mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "barrier_algorithm", - "Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree", - false, false, 0, NULL); + mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "barrier_algorithm", + "Which barrier algorithm is used. 
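
The bruck barrier above is the classic dissemination barrier: ceil(log2(size)) rounds of zero-byte exchanges at doubling distances. The same shape at the public MPI level (the tag value is arbitrary in this sketch; the real code sends through the PML with MCA_COLL_BASE_TAG_BARRIER):

    #include <mpi.h>

    static void dissemination_barrier(MPI_Comm comm)
    {
        int rank, size;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        for (int distance = 1; distance < size; distance <<= 1) {
            int to   = (rank + distance) % size;
            int from = (rank - distance + size) % size;
            /* Each round doubles the set of ranks whose arrival is
               transitively known, so log rounds suffice. */
            MPI_Sendrecv(NULL, 0, MPI_BYTE, to,   1001,
                         NULL, 0, MPI_BYTE, from, 1001,
                         comm, MPI_STATUS_IGNORE);
        }
    }
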
Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only, 6: tree", + false, false, 0, NULL); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; } diff --git a/ompi/mca/coll/tuned/coll_tuned_bcast.c b/ompi/mca/coll/tuned/coll_tuned_bcast.c index 2880a50adb..0ad5cd54ec 100644 --- a/ompi/mca/coll/tuned/coll_tuned_bcast.c +++ b/ompi/mca/coll/tuned/coll_tuned_bcast.c @@ -35,24 +35,20 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, uint32_t count_by_segment, ompi_coll_tree_t* tree ) { - int err = 0, line, i; - int rank, size; - int segindex; + int err = 0, line, i, rank, size, segindex, req_index; int num_segments; /* Number of segments */ int sendcount; /* number of elements sent in this segment */ - size_t realsegsize; + size_t realsegsize, type_size; char *tmpbuf; - size_t type_size; ptrdiff_t extent, lb; ompi_request_t *recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; #if !defined(COLL_TUNED_BCAST_USE_BLOCKING) ompi_request_t **send_reqs = NULL; #endif - int req_index; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -78,7 +74,7 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, /* For each segment: - send segment to all children. - The last segment may have less elements than other segments. + The last segment may have less elements than other segments. */ sendcount = count_by_segment; for( segindex = 0; segindex < num_segments; segindex++ ) { @@ -120,13 +116,13 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, Create the pipeline. 1) Post the first receive 2) For segments 1 .. num_segments - - post new receive - - wait on the previous receive to complete - - send this data to children + - post new receive + - wait on the previous receive to complete + - send this data to children 3) Wait on the last segment 4) Compute number of elements in last segment. 5) Send the last segment to children - */ + */ req_index = 0; MCA_PML_CALL(irecv(tmpbuf, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, @@ -210,8 +206,8 @@ ompi_coll_tuned_bcast_intra_generic( void* buffer, Receive all segments from parent in a loop: 1) post irecv for the first segment 2) for segments 1 .. 
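/* The segmentation arithmetic the generic bcast above relies on, as a
 * standalone sketch consistent with its count_by_segment / num_segments /
 * sendcount variables: ceil-divide the total element count into segments,
 * every segment but the last carrying count_by_segment elements and the
 * last carrying whatever remains. */
#include <stdio.h>

int main(void)
{
    const int count = 1000, count_by_segment = 128;
    int num_segments = (count + count_by_segment - 1) / count_by_segment;
    for (int segindex = 0; segindex < num_segments; segindex++) {
        int sendcount = (segindex == num_segments - 1)
                      ? count - segindex * count_by_segment /* short last segment */
                      : count_by_segment;
        printf("segment %d: %d elements\n", segindex, sendcount);
    }
    return 0;
}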
num_segments - - post irecv for the next segment - - wait on the previous segment to arrive + - post irecv for the next segment + - wait on the previous segment to arrive 3) wait for the last segment */ req_index = 0; @@ -259,7 +255,7 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer, struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, uint32_t segsize ) { int segcount = count; @@ -288,7 +284,7 @@ ompi_coll_tuned_bcast_intra_pipeline( void* buffer, struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, uint32_t segsize ) { int segcount = count; @@ -317,7 +313,7 @@ ompi_coll_tuned_bcast_intra_chain( void* buffer, struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, uint32_t segsize, int32_t chains ) { int segcount = count; @@ -346,7 +342,7 @@ ompi_coll_tuned_bcast_intra_binomial( void* buffer, struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, uint32_t segsize ) { int segcount = count; @@ -375,19 +371,16 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, struct ompi_datatype_t* datatype, int root, struct ompi_communicator_t* comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, uint32_t segsize ) { - int err=0, line; - int rank, size; - int segindex, i, lr, pair; - int segcount[2]; /* Number of elements sent with each segment */ + int err=0, line, rank, size, segindex, i, lr, pair; uint32_t counts[2]; + int segcount[2]; /* Number of elements sent with each segment */ int num_segments[2]; /* Number of segmenets */ int sendcount[2]; /* the same like segcount, except for the last segment */ - size_t realsegsize[2]; + size_t realsegsize[2], type_size; char *tmpbuf[2]; - size_t type_size; ptrdiff_t type_extent, lb; ompi_request_t *base_req, *new_req; ompi_coll_tree_t *tree; @@ -438,7 +431,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer, /* call linear version here ! 
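/* How the byte-oriented "uint32_t segsize" taken by the bintree, pipeline,
 * chain and binomial wrappers above turns into the element count the
 * generic routine wants.  compute_segcount() is a hypothetical stand-in
 * for the component's internal segment-count macro: same idea, not its
 * exact text. */
#include <stdio.h>
#include <stddef.h>

static int compute_segcount(size_t segsize, size_t type_size, int count)
{
    if (0 == segsize || segsize >= (size_t)count * type_size)
        return count;                     /* no segmentation: one segment */
    int segcount = (int)(segsize / type_size);
    return segcount > 0 ? segcount : 1;   /* at least one element per segment */
}

int main(void)
{
    printf("%d\n", compute_segcount(1024, 8, 100000)); /* -> 128 elements */
    printf("%d\n", compute_segcount(0, 8, 100000));    /* -> whole message */
    return 0;
}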
*/ return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype, root, comm, module, - segsize, 1 )); + segsize, 1 )); } err = ompi_datatype_get_extent (datatype, &lb, &type_extent); @@ -644,16 +637,12 @@ int ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count, struct ompi_datatype_t *datatype, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int i; - int size; - int rank; - int err; + int i, size, rank, err; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; - ompi_request_t **preq; - ompi_request_t **reqs = data->mcct_reqs; + ompi_request_t **preq, **reqs = data->mcct_reqs; size = ompi_comm_size(comm); @@ -779,7 +768,7 @@ int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count, struct ompi_datatype_t *dtype, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; @@ -813,7 +802,7 @@ int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count, struct ompi_datatype_t *dtype, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c index d351021af1..3380e1dcf1 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c @@ -79,7 +79,7 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul msg_rules = (ompi_coll_msg_rule_t *) calloc (n_msg_rules, sizeof (ompi_coll_msg_rule_t)); if (!msg_rules) return (msg_rules); - for (i=0;in_com_sizes)); - } - else { + } else { /* ok, memory exists for the com rules so free their message rules first */ - for (i=0;in_com_sizes;i++) { + for( i = 0; i < alg_p->n_com_sizes; i++ ) { com_p = &(alg_p->com_rules[i]); ompi_coll_tuned_free_msg_rules_in_com_rule (com_p); } @@ -265,7 +259,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs) int i; int rc = 0; - for(i=0;itree_nextsize; i++) { - int mycount = 0, vkid; - /* figure out how much data I have to send to this child */ - vkid = (bmtree->tree_next[i] - root + size) % size; - mycount = vkid - vrank; - if (mycount > (size - vkid)) - mycount = size - vkid; - mycount *= rcount; + /* all non-leaf nodes recv from children */ + for (i = 0; i < bmtree->tree_nextsize; i++) { + int mycount = 0, vkid; + /* figure out how much data I have to send to this child */ + vkid = (bmtree->tree_next[i] - root + size) % size; + mycount = vkid - vrank; + if (mycount > (size - vkid)) + mycount = size - vkid; + mycount *= rcount; - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d", - rank, bmtree->tree_next[i], mycount)); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_binomial rank %d recv %d mycount = %d", + rank, bmtree->tree_next[i], mycount)); - err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype, - bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER, - comm, &status)); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype, + 
bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER, + comm, &status)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - total_recv += mycount; - } + total_recv += mycount; + } } if (rank != root) { - /* all nodes except root send to parents */ - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n", - rank, bmtree->tree_prev, total_recv)); + /* all nodes except root send to parents */ + OPAL_OUTPUT((ompi_coll_tuned_stream, + "ompi_coll_tuned_gather_intra_binomial rank %d send %d count %d\n", + rank, bmtree->tree_prev, total_recv)); - err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, - bmtree->tree_prev, - MCA_COLL_BASE_TAG_GATHER, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, + bmtree->tree_prev, + MCA_COLL_BASE_TAG_GATHER, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } if (rank == root) { - if (root != 0) { - /* rotate received data on root if root != 0 */ - err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root), - (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + if (root != 0) { + /* rotate received data on root if root != 0 */ + err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root), + (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root, - (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root)); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root, + (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - free(tempbuf); - } + free(tempbuf); + } } else if (!(vrank % 2)) { - /* other non-leaf nodes */ - free(tempbuf); + /* other non-leaf nodes */ + free(tempbuf); } return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) - free(tempbuf); + free(tempbuf); OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); + __FILE__, line, err, rank)); return err; } @@ -213,23 +206,18 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, int first_segment_size) { - int i; - int ret, line; - int rank, size; - int first_segment_count; + int i, ret, line, rank, size, first_segment_count; + MPI_Aint extent, lb; size_t typelng; - MPI_Aint extent; - MPI_Aint lb; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); + "ompi_coll_tuned_gather_intra_linear_sync rank %d, segment %d", rank, first_segment_size)); if (rank != root) { /* Non-root processes: @@ -259,18 +247,18 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, root, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } - } - else { + } else { + /* Root process, - For every non-root node: 
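/* A standalone sketch of the root-side rotation performed above when
 * root != 0: the binomial gather collects blocks in virtual-rank order
 * (the root's own block first), so the root must rotate the result by
 * "root" blocks to put block i at offset i of rbuf.  Plain ints and
 * memcpy stand in for the datatype engine's copy_content calls. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    enum { size = 5, rcount = 2, root = 2 };
    int ptmp[size * rcount], rbuf[size * rcount];
    for (int i = 0; i < size * rcount; i++)       /* blocks in virtual order: */
        ptmp[i] = (root + i / rcount) % size;     /* root, root+1, ..., root-1 */

    /* blocks [0 .. size-root) land at offset root, the rest wrap to offset 0 */
    memcpy(rbuf + root * rcount, ptmp, (size - root) * rcount * sizeof(int));
    memcpy(rbuf, ptmp + (size - root) * rcount, root * rcount * sizeof(int));

    for (int i = 0; i < size * rcount; i++)
        printf("%d ", rbuf[i]);                   /* prints 0 0 1 1 2 2 3 3 4 4 */
    printf("\n");
    return 0;
}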
- - post irecv for the first segment of the message - - send zero byte message to signal node to send the message - - post irecv for the second segment of the message - - wait for the first segment to complete + - post irecv for the first segment of the message + - send zero byte message to signal node to send the message + - post irecv for the second segment of the message + - wait for the first segment to complete - Copy local data if necessary - Waitall for all the second segments to complete. - */ + */ char *ptmp; ompi_request_t **reqs = NULL, *first_segment_req; reqs = (ompi_request_t**) calloc(size, sizeof(ompi_request_t*)); @@ -318,8 +306,8 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, /* copy local data if necessary */ if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_sndrcv(sbuf, scount, sdtype, - (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent, - rcount, rdtype); + (char*)rbuf + (ptrdiff_t)rank * (ptrdiff_t)rcount * extent, + rcount, rdtype); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } @@ -362,28 +350,23 @@ ompi_coll_tuned_gather_intra_linear_sync(void *sbuf, int scount, */ int ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - int i; - int err; - int rank; - int size; + int i, err, rank, size; char *ptmp; - MPI_Aint incr; - MPI_Aint extent; - MPI_Aint lb; + MPI_Aint incr, extent, lb; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* Everyone but root sends data and returns. 
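/* A minimal MPI sketch (standard MPI calls, not the PML internals) of the
 * two-segment handshake described in the comment above: the root pre-posts
 * the receive for a peer's first segment, pokes the peer with a zero-byte
 * message, pre-posts the second segment, and only then waits, so the peer
 * never sends into an unposted receive.  Run with exactly 2 ranks; rank 0
 * plays the root. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, first = 40, rest = 60, buf[100];
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (0 == rank) {
        MPI_Request r1, r2;
        MPI_Irecv(buf, first, MPI_INT, 1, 0, MPI_COMM_WORLD, &r1);
        MPI_Send(NULL, 0, MPI_BYTE, 1, 0, MPI_COMM_WORLD); /* "go" signal */
        MPI_Irecv(buf + first, rest, MPI_INT, 1, 0, MPI_COMM_WORLD, &r2);
        MPI_Wait(&r1, MPI_STATUS_IGNORE);
        MPI_Wait(&r2, MPI_STATUS_IGNORE);
        printf("root: got %d + %d elements\n", first, rest);
    } else if (1 == rank) {
        for (int i = 0; i < 100; i++) buf[i] = i;
        MPI_Recv(NULL, 0, MPI_BYTE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Send(buf, first, MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(buf + first, rest, MPI_INT, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Finalize();
    return 0;
}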
*/ OPAL_OUTPUT((ompi_coll_tuned_stream, - "ompi_coll_tuned_gather_intra_basic_linear rank %d", rank)); + "ompi_coll_tuned_gather_intra_basic_linear rank %d", rank)); if (rank != root) { return MCA_PML_CALL(send(sbuf, scount, sdtype, root, @@ -399,7 +382,7 @@ ompi_coll_tuned_gather_intra_basic_linear(void *sbuf, int scount, if (i == rank) { if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_sndrcv(sbuf, scount, sdtype, - ptmp, rcount, rdtype); + ptmp, rcount, rdtype); } else { err = MPI_SUCCESS; } @@ -489,88 +472,85 @@ ompi_coll_tuned_gather_intra_check_forced_init(coll_tuned_force_algorithm_mca_pa int ompi_coll_tuned_gather_intra_do_forced(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_forced selected algorithm %d", - data->user_forced[GATHER].algorithm)); + "coll:tuned:gather_intra_do_forced selected algorithm %d", + data->user_forced[GATHER].algorithm)); switch (data->user_forced[GATHER].algorithm) { case (0): - return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); + return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module); case (1): - return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); + return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module); case (2): return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module); case (3): - { - const int first_segment_size = data->user_forced[GATHER].segsize; return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module, - first_segment_size); - } + data->user_forced[GATHER].segsize); default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[GATHER].algorithm, - ompi_coll_tuned_forced_max_algorithms[GATHER])); - return (MPI_ERR_ARG); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:gather_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + data->user_forced[GATHER].algorithm, + ompi_coll_tuned_forced_max_algorithms[GATHER])); + return (MPI_ERR_ARG); } /* switch */ } int ompi_coll_tuned_gather_intra_do_this(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, - int algorithm, int faninout, int segsize) + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module, + int algorithm, int faninout, int segsize) { OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + "coll:tuned:gather_intra_do_this selected 
algorithm %d topo faninout %d segsize %d", + algorithm, faninout, segsize)); switch (algorithm) { case (0): - return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); + return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module); case (1): - return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); + return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module); case (2): - return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, module); + return ompi_coll_tuned_gather_intra_binomial(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module); case (3): - return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, + return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module, - segsize); + segsize); default: - OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, - ompi_coll_tuned_forced_max_algorithms[GATHER])); - return (MPI_ERR_ARG); + OPAL_OUTPUT((ompi_coll_tuned_stream, + "coll:tuned:gather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, + ompi_coll_tuned_forced_max_algorithms[GATHER])); + return (MPI_ERR_ARG); } /* switch */ } diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index 94fd7eb67b..c96ff923ca 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -42,8 +42,6 @@ static int tuned_module_enable(mca_coll_base_module_t *module, int ompi_coll_tuned_init_query(bool enable_progress_threads, bool enable_mpi_threads) { - /* Nothing to do */ - return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce.c b/ompi/mca/coll/tuned/coll_tuned_reduce.c index e81eba2341..66824489cc 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce.c @@ -483,9 +483,7 @@ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, uint32_t segsize, int max_outstanding_reqs ) { - int ret; - int rank, size, io_root; - int segcount = count; + int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL; size_t typelng; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; @@ -603,10 +601,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; - char *free_buffer = NULL; - char *pml_buffer = NULL; - char *inplace_temp = NULL; - char *inbuf; + char *free_buffer = NULL, *pml_buffer = NULL; + char *inplace_temp = NULL, *inbuf; /* Initialize */ diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c index bc247f03c1..1a5470f7f8 100644 --- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c +++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter.c @@ -43,16 +43,11 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int err, i; - int rank, size; - int total_count; - int *displs = NULL; - 
char *tmprbuf = NULL; - char *tmprbuf_free = NULL; - + int err, i, rank, size, total_count, *displs = NULL; const int root = 0; + char *tmprbuf = NULL, *tmprbuf_free = NULL; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); @@ -64,42 +59,42 @@ int ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(void *sbuf, void *rbuf, /* Reduce to rank 0 (root) and scatterv */ tmprbuf = (char*) rbuf; if (MPI_IN_PLACE == sbuf) { - /* rbuf on root (0) is big enough to hold whole data */ - if (root == rank) { - err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count, - dtype, op, root, comm, comm->c_coll.coll_reduce_module); - } else { - err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count, - dtype, op, root, comm, comm->c_coll.coll_reduce_module); - } + /* rbuf on root (0) is big enough to hold whole data */ + if (root == rank) { + err = comm->c_coll.coll_reduce (MPI_IN_PLACE, tmprbuf, total_count, + dtype, op, root, comm, comm->c_coll.coll_reduce_module); + } else { + err = comm->c_coll.coll_reduce(tmprbuf, NULL, total_count, + dtype, op, root, comm, comm->c_coll.coll_reduce_module); + } } else { - if (root == rank) { - /* We must allocate temporary receive buffer on root to ensure that - rbuf is big enough */ - ptrdiff_t lb, extent, tlb, textent; + if (root == rank) { + /* We must allocate temporary receive buffer on root to ensure that + rbuf is big enough */ + ptrdiff_t lb, extent, tlb, textent; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &tlb, &textent); + ompi_datatype_get_extent(dtype, &lb, &extent); + ompi_datatype_get_true_extent(dtype, &tlb, &textent); - tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent); - tmprbuf = tmprbuf_free - lb; - } - err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count, - dtype, op, root, comm, comm->c_coll.coll_reduce_module); + tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent); + tmprbuf = tmprbuf_free - lb; + } + err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count, + dtype, op, root, comm, comm->c_coll.coll_reduce_module); } if (MPI_SUCCESS != err) { - if (NULL != tmprbuf_free) free(tmprbuf_free); - return err; + if (NULL != tmprbuf_free) free(tmprbuf_free); + return err; } displs = (int*) malloc(size * sizeof(int)); displs[0] = 0; for (i = 1; i < size; i++) { - displs[i] = displs[i-1] + rcounts[i-1]; + displs[i] = displs[i-1] + rcounts[i-1]; } err = comm->c_coll.coll_scatterv (tmprbuf, rcounts, displs, dtype, - rbuf, rcounts[rank], dtype, - root, comm, comm->c_coll.coll_scatterv_module); + rbuf, rcounts[rank], dtype, + root, comm, comm->c_coll.coll_scatterv_module); free(displs); if (NULL != tmprbuf_free) free(tmprbuf_free); @@ -130,11 +125,10 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; - int tmp_size, remain = 0, tmp_rank; - int *disps = NULL; + int tmp_size, remain = 0, tmp_rank, *disps = NULL; ptrdiff_t true_lb, true_extent, lb, extent, buf_size; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; @@ -151,14 +145,14 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, disps[0] = 0; for (i = 0; i < (size - 1); ++i) { - disps[i + 1] = disps[i] + rcounts[i]; + disps[i + 1] = disps[i] + rcounts[i]; } count = disps[size - 1] + 
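/* The whole "non-overlapping" reduce_scatter above, restated as a
 * standalone sketch in standard MPI: reduce everything to one root, build
 * displacements by prefix-summing rcounts, then scatterv the pieces back
 * out.  MPI_SUM over ints stands in for the general op/datatype pair. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int *rcounts = malloc(size * sizeof(int));
    int *displs  = malloc(size * sizeof(int));
    for (int i = 0; i < size; i++) rcounts[i] = i + 1;  /* uneven pieces */
    displs[0] = 0;
    for (int i = 1; i < size; i++) displs[i] = displs[i-1] + rcounts[i-1];
    int total = displs[size-1] + rcounts[size-1];

    int *sbuf = malloc(total * sizeof(int));
    int *tmp  = (0 == rank) ? malloc(total * sizeof(int)) : NULL;
    int *rbuf = malloc(rcounts[rank] * sizeof(int));
    for (int i = 0; i < total; i++) sbuf[i] = i;

    MPI_Reduce(sbuf, tmp, total, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Scatterv(tmp, rcounts, displs, MPI_INT,
                 rbuf, rcounts[rank], MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d: first owned element = %d\n", rank, rbuf[0]);
    free(sbuf); free(rbuf); free(rcounts); free(displs); free(tmp);
    return MPI_Finalize();
}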
rcounts[size - 1]; /* short cut the trivial case */ if (0 == count) { - free(disps); - return OMPI_SUCCESS; + free(disps); + return OMPI_SUCCESS; } /* get datatype information */ @@ -168,15 +162,15 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { - sbuf = rbuf; + sbuf = rbuf; } /* Allocate temporary receive buffer. */ recv_buf_free = (char*) malloc(buf_size); recv_buf = recv_buf_free - lb; if (NULL == recv_buf_free) { - err = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; + err = OMPI_ERR_OUT_OF_RESOURCE; + goto cleanup; } /* allocate temporary buffer for results */ @@ -198,186 +192,186 @@ ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(void *sbuf, procs with an even rank send to rank + 1, leaving a power of two procs to do the rest of the algorithm */ if (rank < 2 * remain) { - if ((rank & 1) == 0) { - err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - MCA_PML_BASE_SEND_STANDARD, - comm)); - if (OMPI_SUCCESS != err) goto cleanup; + if ((rank & 1) == 0) { + err = MCA_PML_CALL(send(result_buf, count, dtype, rank + 1, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + MCA_PML_BASE_SEND_STANDARD, + comm)); + if (OMPI_SUCCESS != err) goto cleanup; - /* we don't participate from here on out */ - tmp_rank = -1; - } else { - err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - comm, MPI_STATUS_IGNORE)); + /* we don't participate from here on out */ + tmp_rank = -1; + } else { + err = MCA_PML_CALL(recv(recv_buf, count, dtype, rank - 1, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + comm, MPI_STATUS_IGNORE)); - /* integrate their results into our temp results */ - ompi_op_reduce(op, recv_buf, result_buf, count, dtype); + /* integrate their results into our temp results */ + ompi_op_reduce(op, recv_buf, result_buf, count, dtype); - /* adjust rank to be the bottom "remain" ranks */ - tmp_rank = rank / 2; - } + /* adjust rank to be the bottom "remain" ranks */ + tmp_rank = rank / 2; + } } else { - /* just need to adjust rank to show that the bottom "even - remain" ranks dropped out */ - tmp_rank = rank - remain; + /* just need to adjust rank to show that the bottom "even + remain" ranks dropped out */ + tmp_rank = rank - remain; } /* For ranks not kicked out by the above code, perform the recursive halving */ if (tmp_rank >= 0) { - int *tmp_disps = NULL, *tmp_rcounts = NULL; - int mask, send_index, recv_index, last_index; + int *tmp_disps = NULL, *tmp_rcounts = NULL; + int mask, send_index, recv_index, last_index; - /* recalculate disps and rcounts to account for the - special "remainder" processes that are no longer doing - anything */ - tmp_rcounts = (int*) malloc(tmp_size * sizeof(int)); - if (NULL == tmp_rcounts) { - err = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; - } - tmp_disps = (int*) malloc(tmp_size * sizeof(int)); - if (NULL == tmp_disps) { - free(tmp_rcounts); - err = OMPI_ERR_OUT_OF_RESOURCE; - goto cleanup; - } + /* recalculate disps and rcounts to account for the + special "remainder" processes that are no longer doing + anything */ + tmp_rcounts = (int*) malloc(tmp_size * sizeof(int)); + if (NULL == tmp_rcounts) { + err = OMPI_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + tmp_disps = (int*) malloc(tmp_size * sizeof(int)); + if (NULL == tmp_disps) { + free(tmp_rcounts); + err = OMPI_ERR_OUT_OF_RESOURCE; + goto cleanup; + } - for (i = 0 ; i < tmp_size ; ++i) { - if (i < remain) { - /* need to include old neighbor as well */ - 
tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; - } else { - tmp_rcounts[i] = rcounts[i + remain]; - } - } + for (i = 0 ; i < tmp_size ; ++i) { + if (i < remain) { + /* need to include old neighbor as well */ + tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2]; + } else { + tmp_rcounts[i] = rcounts[i + remain]; + } + } - tmp_disps[0] = 0; - for (i = 0; i < tmp_size - 1; ++i) { - tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; - } + tmp_disps[0] = 0; + for (i = 0; i < tmp_size - 1; ++i) { + tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i]; + } - /* do the recursive halving communication. Don't use the - dimension information on the communicator because I - think the information is invalidated by our "shrinking" - of the communicator */ - mask = tmp_size >> 1; - send_index = recv_index = 0; - last_index = tmp_size; - while (mask > 0) { - int tmp_peer, peer, send_count, recv_count; - struct ompi_request_t *request; + /* do the recursive halving communication. Don't use the + dimension information on the communicator because I + think the information is invalidated by our "shrinking" + of the communicator */ + mask = tmp_size >> 1; + send_index = recv_index = 0; + last_index = tmp_size; + while (mask > 0) { + int tmp_peer, peer, send_count, recv_count; + struct ompi_request_t *request; - tmp_peer = tmp_rank ^ mask; - peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; + tmp_peer = tmp_rank ^ mask; + peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain; - /* figure out if we're sending, receiving, or both */ - send_count = recv_count = 0; - if (tmp_rank < tmp_peer) { - send_index = recv_index + mask; - for (i = send_index ; i < last_index ; ++i) { - send_count += tmp_rcounts[i]; - } - for (i = recv_index ; i < send_index ; ++i) { - recv_count += tmp_rcounts[i]; - } - } else { - recv_index = send_index + mask; - for (i = send_index ; i < recv_index ; ++i) { - send_count += tmp_rcounts[i]; - } - for (i = recv_index ; i < last_index ; ++i) { - recv_count += tmp_rcounts[i]; - } - } + /* figure out if we're sending, receiving, or both */ + send_count = recv_count = 0; + if (tmp_rank < tmp_peer) { + send_index = recv_index + mask; + for (i = send_index ; i < last_index ; ++i) { + send_count += tmp_rcounts[i]; + } + for (i = recv_index ; i < send_index ; ++i) { + recv_count += tmp_rcounts[i]; + } + } else { + recv_index = send_index + mask; + for (i = send_index ; i < recv_index ; ++i) { + send_count += tmp_rcounts[i]; + } + for (i = recv_index ; i < last_index ; ++i) { + recv_count += tmp_rcounts[i]; + } + } - /* actual data transfer. Send from result_buf, - receive into recv_buf */ - if (send_count > 0 && recv_count != 0) { - err = MCA_PML_CALL(irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, - recv_count, dtype, peer, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - comm, &request)); - if (OMPI_SUCCESS != err) { - free(tmp_rcounts); - free(tmp_disps); - goto cleanup; - } - } - if (recv_count > 0 && send_count != 0) { - err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent, - send_count, dtype, peer, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - MCA_PML_BASE_SEND_STANDARD, - comm)); - if (OMPI_SUCCESS != err) { - free(tmp_rcounts); - free(tmp_disps); - goto cleanup; - } - } - if (send_count > 0 && recv_count != 0) { - err = ompi_request_wait(&request, MPI_STATUS_IGNORE); - if (OMPI_SUCCESS != err) { - free(tmp_rcounts); - free(tmp_disps); - goto cleanup; - } - } + /* actual data transfer. 
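/* A standalone sketch of the index bookkeeping in the recursive-halving
 * loop above: each round, a rank keeps one half of its current window of
 * result blocks and ships the other half to the peer at xor-distance
 * "mask", so after log2(P) rounds only its own block remains.  In the real
 * code the send/recv element counts are sums of tmp_rcounts over these
 * index ranges; power-of-two size only, matching the post-remainder phase. */
#include <stdio.h>

int main(void)
{
    const int tmp_size = 8, tmp_rank = 5;   /* power-of-two size, any rank */
    int send_index = 0, recv_index = 0, last_index = tmp_size;
    for (int mask = tmp_size >> 1; mask > 0; mask >>= 1) {
        int tmp_peer = tmp_rank ^ mask;
        if (tmp_rank < tmp_peer)
            send_index = recv_index + mask; /* ship upper half, keep lower */
        else
            recv_index = send_index + mask; /* ship lower half, keep upper */
        send_index = recv_index;
        last_index = recv_index + mask;
        printf("exchange with %d: now responsible for blocks [%d, %d)\n",
               tmp_peer, recv_index, last_index);
    }
    printf("final block: %d (== rank)\n", recv_index);
    return 0;
}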
Send from result_buf, + receive into recv_buf */ + if (send_count > 0 && recv_count != 0) { + err = MCA_PML_CALL(irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, + recv_count, dtype, peer, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + comm, &request)); + if (OMPI_SUCCESS != err) { + free(tmp_rcounts); + free(tmp_disps); + goto cleanup; + } + } + if (recv_count > 0 && send_count != 0) { + err = MCA_PML_CALL(send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent, + send_count, dtype, peer, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + MCA_PML_BASE_SEND_STANDARD, + comm)); + if (OMPI_SUCCESS != err) { + free(tmp_rcounts); + free(tmp_disps); + goto cleanup; + } + } + if (send_count > 0 && recv_count != 0) { + err = ompi_request_wait(&request, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != err) { + free(tmp_rcounts); + free(tmp_disps); + goto cleanup; + } + } - /* if we received something on this step, push it into - the results buffer */ - if (recv_count > 0) { - ompi_op_reduce(op, - recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, - result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, - recv_count, dtype); - } + /* if we received something on this step, push it into + the results buffer */ + if (recv_count > 0) { + ompi_op_reduce(op, + recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, + result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, + recv_count, dtype); + } - /* update for next iteration */ - send_index = recv_index; - last_index = recv_index + mask; - mask >>= 1; - } + /* update for next iteration */ + send_index = recv_index; + last_index = recv_index + mask; + mask >>= 1; + } - /* copy local results from results buffer into real receive buffer */ - if (0 != rcounts[rank]) { - err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, - rcounts[rank], dtype, - rbuf, rcounts[rank], dtype); - if (OMPI_SUCCESS != err) { - free(tmp_rcounts); - free(tmp_disps); - goto cleanup; - } - } + /* copy local results from results buffer into real receive buffer */ + if (0 != rcounts[rank]) { + err = ompi_datatype_sndrcv(result_buf + disps[rank] * extent, + rcounts[rank], dtype, + rbuf, rcounts[rank], dtype); + if (OMPI_SUCCESS != err) { + free(tmp_rcounts); + free(tmp_disps); + goto cleanup; + } + } - free(tmp_rcounts); - free(tmp_disps); + free(tmp_rcounts); + free(tmp_disps); } /* Now fix up the non-power of two case, by having the odd procs send the even procs the proper results */ if (rank < (2 * remain)) { - if ((rank & 1) == 0) { - if (rcounts[rank]) { - err = MCA_PML_CALL(recv(rbuf, rcounts[rank], dtype, rank + 1, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - comm, MPI_STATUS_IGNORE)); - if (OMPI_SUCCESS != err) goto cleanup; - } - } else { - if (rcounts[rank - 1]) { - err = MCA_PML_CALL(send(result_buf + disps[rank - 1] * extent, - rcounts[rank - 1], dtype, rank - 1, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - MCA_PML_BASE_SEND_STANDARD, - comm)); - if (OMPI_SUCCESS != err) goto cleanup; - } - } + if ((rank & 1) == 0) { + if (rcounts[rank]) { + err = MCA_PML_CALL(recv(rbuf, rcounts[rank], dtype, rank + 1, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + comm, MPI_STATUS_IGNORE)); + if (OMPI_SUCCESS != err) goto cleanup; + } + } else { + if (rcounts[rank - 1]) { + err = MCA_PML_CALL(send(result_buf + disps[rank - 1] * extent, + rcounts[rank - 1], dtype, rank - 1, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + MCA_PML_BASE_SEND_STANDARD, + comm)); + if (OMPI_SUCCESS != err) goto cleanup; + } + } } cleanup: @@ -455,27 +449,22 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, 
struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + mca_coll_base_module_t *module) { - int ret, line; - int rank, size, i, k, recv_from, send_to; - int total_count, max_block_count; - int inbi; - int *displs = NULL; - size_t typelng; - char *tmpsend = NULL, *tmprecv = NULL; - char *inbuf_free[2] = {NULL, NULL}; - char *inbuf[2] = {NULL, NULL}; - char *accumbuf = NULL, *accumbuf_free = NULL; + int ret, line, rank, size, i, k, recv_from, send_to, total_count, max_block_count; + int inbi, *displs = NULL; + char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL; + char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL}; ptrdiff_t true_lb, true_extent, lb, extent, max_real_segsize; ompi_request_t *reqs[2] = {NULL, NULL}; + size_t typelng; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, - "coll:tuned:reduce_scatter_intra_ring rank %d, size %d", - rank, size)); + "coll:tuned:reduce_scatter_intra_ring rank %d, size %d", + rank, size)); /* Determine the maximum number of elements per node, corresponding block size, and displacements array. @@ -486,20 +475,20 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, total_count = rcounts[0]; max_block_count = rcounts[0]; for (i = 1; i < size; i++) { - displs[i] = total_count; - total_count += rcounts[i]; - if (max_block_count < rcounts[i]) max_block_count = rcounts[i]; + displs[i] = total_count; + total_count += rcounts[i]; + if (max_block_count < rcounts[i]) max_block_count = rcounts[i]; } /* Special case for size == 1 */ if (1 == size) { - if (MPI_IN_PLACE != sbuf) { - ret = ompi_datatype_copy_content_same_ddt(dtype, total_count, - (char*)rbuf, (char*)sbuf); - if (ret < 0) { line = __LINE__; goto error_hndl; } - } - free(displs); - return MPI_SUCCESS; + if (MPI_IN_PLACE != sbuf) { + ret = ompi_datatype_copy_content_same_ddt(dtype, total_count, + (char*)rbuf, (char*)sbuf); + if (ret < 0) { line = __LINE__; goto error_hndl; } + } + free(displs); + return MPI_SUCCESS; } /* Allocate and initialize temporary buffers, we need: @@ -524,9 +513,9 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; } inbuf[0] = inbuf_free[0] - lb; if (size > 2) { - inbuf_free[1] = (char*)malloc(max_real_segsize); - if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; } - inbuf[1] = inbuf_free[1] - lb; + inbuf_free[1] = (char*)malloc(max_real_segsize); + if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; } + inbuf[1] = inbuf_free[1] - lb; } /* Handle MPI_IN_PLACE for size > 1 */ @@ -535,7 +524,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, } ret = ompi_datatype_copy_content_same_ddt(dtype, total_count, - accumbuf, (char*)sbuf); + accumbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } /* Computation loop */ @@ -561,41 +550,41 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, inbi = 0; /* Initialize first receive from the neighbor on the left */ ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, - &reqs[inbi])); + MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, + &reqs[inbi])); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } tmpsend = accumbuf + (ptrdiff_t)displs[recv_from] * extent; ret = 
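/* A standalone sketch of the ring schedule used above, with the neighbour
 * arithmetic assumed as send_to = (rank+1) % size and recv_from =
 * (rank-1+size) % size, consistent with the block indices visible in the
 * routine: rank r first injects block (r-1) toward its right neighbour,
 * then at step k reduces the block (r-k mod P) arriving from the left into
 * its accumulator and passes it on.  Each block thus makes P-1 hops and
 * finishes, fully reduced, on the rank that owns it. */
#include <stdio.h>

int main(void)
{
    const int size = 4;
    for (int rank = 0; rank < size; rank++) {
        int send_to   = (rank + 1) % size;
        int recv_from = (rank - 1 + size) % size;
        printf("rank %d (left %d, right %d): reduces blocks",
               rank, recv_from, send_to);
        for (int k = 2; k <= size; k++)     /* k == size is the rank's own block */
            printf(" %d", (rank - k + 2 * size) % size);
        printf("\n");
    }
    return 0;
}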
MCA_PML_CALL(send(tmpsend, rcounts[recv_from], dtype, send_to, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - MCA_PML_BASE_SEND_STANDARD, comm)); + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } for (k = 2; k < size; k++) { - const int prevblock = (rank + size - k) % size; + const int prevblock = (rank + size - k) % size; - inbi = inbi ^ 0x1; + inbi = inbi ^ 0x1; - /* Post irecv for the current block */ - ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, - &reqs[inbi])); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Post irecv for the current block */ + ret = MCA_PML_CALL(irecv(inbuf[inbi], max_block_count, dtype, recv_from, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, comm, + &reqs[inbi])); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Wait on previous block to arrive */ - ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* Wait on previous block to arrive */ + ret = ompi_request_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - /* Apply operation on previous block: result goes to rbuf - rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] - */ - tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent; - ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype); + /* Apply operation on previous block: result goes to rbuf + rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock] + */ + tmprecv = accumbuf + (ptrdiff_t)displs[prevblock] * extent; + ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, rcounts[prevblock], dtype); - /* send previous block to send_to */ - ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to, - MCA_COLL_BASE_TAG_REDUCE_SCATTER, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + /* send previous block to send_to */ + ret = MCA_PML_CALL(send(tmprecv, rcounts[prevblock], dtype, send_to, + MCA_COLL_BASE_TAG_REDUCE_SCATTER, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } } /* Wait on the last block to arrive */ @@ -620,7 +609,7 @@ ompi_coll_tuned_reduce_scatter_intra_ring(void *sbuf, void *rbuf, int *rcounts, error_hndl: OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tRank %d Error occurred %d\n", - __FILE__, line, rank, ret)); + __FILE__, line, rank, ret)); if (NULL != displs) free(displs); if (NULL != accumbuf_free) free(accumbuf_free); if (NULL != inbuf_free[0]) free(inbuf_free[0]); @@ -652,10 +641,10 @@ int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_alg false, true, max_alg, NULL); mca_param_indices->algorithm_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "reduce_scatter_algorithm", - "Which reduce reduce_scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring", - false, false, 0, NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "reduce_scatter_algorithm", + "Which reduce reduce_scatter algorithm is used. 
Can be locked down to choice of: 0 ignore, 1 non-overlapping (Reduce + Scatterv), 2 recursive halving, 3 ring", + false, false, 0, NULL); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; } @@ -695,30 +684,30 @@ int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_alg int ompi_coll_tuned_reduce_scatter_intra_do_forced(void *sbuf, void* rbuf, int *rcounts, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced selected algorithm %d", - data->user_forced[REDUCESCATTER].algorithm)); + data->user_forced[REDUCESCATTER].algorithm)); switch (data->user_forced[REDUCESCATTER].algorithm) { case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", - data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); - return (MPI_ERR_ARG); + OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?", + data->user_forced[REDUCESCATTER].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); + return (MPI_ERR_ARG); } /* switch */ } @@ -728,25 +717,25 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(void *sbuf, void* rbuf, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, - mca_coll_base_module_t *module, + mca_coll_base_module_t *module, int algorithm, int faninout, int segsize) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d", - algorithm, faninout, segsize)); + algorithm, faninout, segsize)); switch (algorithm) { case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed (sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); case (1): return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); case (2): return ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); case (3): return ompi_coll_tuned_reduce_scatter_intra_ring (sbuf, rbuf, rcounts, - dtype, op, comm, module); + dtype, op, comm, module); default: - OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", - algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); - return (MPI_ERR_ARG); + 
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", + algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCESCATTER])); + return (MPI_ERR_ARG); } /* switch */ } diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter.c b/ompi/mca/coll/tuned/coll_tuned_scatter.c index e3d6bd1887..996f0cf7e5 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scatter.c +++ b/ompi/mca/coll/tuned/coll_tuned_scatter.c @@ -32,22 +32,15 @@ int ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - int line = -1; - int i; - int rank; - int vrank; - int size; - int total_send = 0; - char *ptmp = NULL; - char *tempbuf = NULL; - int err; + int line = -1, i, rank, vrank, size, total_send = 0, err; + char *ptmp, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, slb, strue_lb, strue_extent; @@ -71,111 +64,109 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); vrank = (rank - root + size) % size; + ptmp = (char *) rbuf; /* by default suppose leaf nodes, just use rbuf */ if (rank == root) { - if (0 == root) { - /* root on 0, just use the send buffer */ - ptmp = (char *) sbuf; - if (rbuf != MPI_IN_PLACE) { - /* local copy to rbuf */ - err = ompi_datatype_sndrcv(sbuf, scount, sdtype, - rbuf, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } - } else { - /* root is not on 0, allocate temp buffer for send */ - tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); - if (NULL == tempbuf) { - err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; - } + if (0 == root) { + /* root on 0, just use the send buffer */ + ptmp = (char *) sbuf; + if (rbuf != MPI_IN_PLACE) { + /* local copy to rbuf */ + err = ompi_datatype_sndrcv(sbuf, scount, sdtype, + rbuf, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + } else { + /* root is not on 0, allocate temp buffer for send */ + tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); + if (NULL == tempbuf) { + err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; + } - ptmp = tempbuf - slb; + ptmp = tempbuf - slb; - /* and rotate data so they will eventually in the right place */ - err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root), - ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* and rotate data so they will eventually in the right place */ + err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root), + ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root, - ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = ompi_datatype_copy_content_same_ddt(sdtype, 
(ptrdiff_t)scount * (ptrdiff_t)root, + ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - if (rbuf != MPI_IN_PLACE) { - /* local copy to rbuf */ - err = ompi_datatype_sndrcv(ptmp, scount, sdtype, - rbuf, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } - } - total_send = scount; + if (rbuf != MPI_IN_PLACE) { + /* local copy to rbuf */ + err = ompi_datatype_sndrcv(ptmp, scount, sdtype, + rbuf, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + } + total_send = scount; } else if (!(vrank % 2)) { - /* non-root, non-leaf nodes, allocte temp buffer for recv - * the most we need is rcount*size/2 */ - tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); - if (NULL == tempbuf) { - err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; - } + /* non-root, non-leaf nodes, allocte temp buffer for recv + * the most we need is rcount*size/2 */ + tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); + if (NULL == tempbuf) { + err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; + } - ptmp = tempbuf - rlb; + ptmp = tempbuf - rlb; - sdtype = rdtype; - scount = rcount; - sextent = rextent; - total_send = scount; - } else { - /* leaf nodes, just use rbuf */ - ptmp = (char *) rbuf; + sdtype = rdtype; + scount = rcount; + sextent = rextent; + total_send = scount; } if (!(vrank % 2)) { - if (rank != root) { - /* recv from parent on non-root */ - err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev, - MCA_COLL_BASE_TAG_SCATTER, comm, &status)); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - /* local copy to rbuf */ - err = ompi_datatype_sndrcv(ptmp, scount, sdtype, - rbuf, rcount, rdtype); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - } - /* send to children on all non-leaf */ - for (i = 0; i < bmtree->tree_nextsize; i++) { - size_t mycount = 0; + if (rank != root) { + /* recv from parent on non-root */ + err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev, + MCA_COLL_BASE_TAG_SCATTER, comm, &status)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* local copy to rbuf */ + err = ompi_datatype_sndrcv(ptmp, scount, sdtype, + rbuf, rcount, rdtype); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + } + /* send to children on all non-leaf */ + for (i = 0; i < bmtree->tree_nextsize; i++) { + size_t mycount = 0; int vkid; - /* figure out how much data I have to send to this child */ - vkid = (bmtree->tree_next[i] - root + size) % size; - mycount = vkid - vrank; - if( (int)mycount > (size - vkid) ) - mycount = size - vkid; - mycount *= scount; + /* figure out how much data I have to send to this child */ + vkid = (bmtree->tree_next[i] - root + size) % size; + mycount = vkid - vrank; + if( (int)mycount > (size - vkid) ) + mycount = size - vkid; + mycount *= scount; - err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype, - bmtree->tree_next[i], - MCA_COLL_BASE_TAG_SCATTER, - MCA_PML_BASE_SEND_STANDARD, comm)); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)total_send * sextent, mycount, sdtype, + bmtree->tree_next[i], + MCA_COLL_BASE_TAG_SCATTER, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (MPI_SUCCESS != err) { line = __LINE__; goto 
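/* A standalone sketch of the per-child accounting above: in the binomial
 * scatter, the child reached through virtual rank vkid covers the
 * contiguous range of virtual ranks starting at vkid, so the parent
 * forwards (vkid - vrank) blocks, clamped to (size - vkid), times scount
 * elements.  Children are enumerated with the same xor-mask walk the
 * in-order binomial tree uses. */
#include <stdio.h>

int main(void)
{
    const int size = 6, scount = 3;
    for (int vrank = 0; vrank < size; vrank++) {
        printf("vrank %d:", vrank);
        for (int mask = 1; mask < size; mask <<= 1) {
            int vkid = vrank ^ mask;
            if (vkid < vrank) break;       /* that peer is the parent */
            if (vkid >= size) continue;
            int mycount = vkid - vrank;
            if (mycount > size - vkid)
                mycount = size - vkid;     /* clamp at the end of the range */
            printf("  child %d gets %d elems", vkid, mycount * scount);
        }
        printf("\n");
    }
    return 0;
}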
err_hndl; } - total_send += mycount; - } + total_send += mycount; + } - if (NULL != tempbuf) - free(tempbuf); + if (NULL != tempbuf) + free(tempbuf); } else { - /* recv from parent on leaf nodes */ - err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev, - MCA_COLL_BASE_TAG_SCATTER, comm, &status)); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + /* recv from parent on leaf nodes */ + err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev, + MCA_COLL_BASE_TAG_SCATTER, comm, &status)); + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) - free(tempbuf); + free(tempbuf); OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", - __FILE__, line, err, rank)); + __FILE__, line, err, rank)); return err; } @@ -201,16 +192,16 @@ ompi_coll_tuned_scatter_intra_binomial(void *sbuf, int scount, */ int ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { int i, rank, size, err; - char *ptmp; ptrdiff_t lb, incr; + char *ptmp; /* Initialize */ @@ -242,7 +233,7 @@ ompi_coll_tuned_scatter_intra_basic_linear(void *sbuf, int scount, if (MPI_IN_PLACE != rbuf) { err = ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount, - rdtype); + rdtype); } } else { err = MCA_PML_CALL(send(ptmp, scount, sdtype, i, @@ -281,123 +272,123 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p ompi_coll_tuned_forced_max_algorithms[SCATTER] = max_alg; rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_count", - "Number of scatter algorithms available", - false, true, max_alg, NULL); + "scatter_algorithm_count", + "Number of scatter algorithms available", + false, true, max_alg, NULL); mca_param_indices->algorithm_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm", - "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.", - false, false, 0, NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "scatter_algorithm", + "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.", + false, false, 0, NULL); if (mca_param_indices->algorithm_param_index < 0) { return mca_param_indices->algorithm_param_index; } mca_base_param_lookup_int(mca_param_indices->algorithm_param_index, &(requested_alg)); if( 0 > requested_alg || requested_alg > max_alg ) { - if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { - opal_output( 0, "Scatter algorithm #%d is not available (range [0..%d]). Switching back to ignore(0)\n", - requested_alg, max_alg ); - } - mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0); + if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { + opal_output( 0, "Scatter algorithm #%d is not available (range [0..%d]). 
Switching back to ignore(0)\n", + requested_alg, max_alg ); + } + mca_base_param_set_int( mca_param_indices->algorithm_param_index, 0); } mca_param_indices->segsize_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_segmentsize", - "Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.", - false, false, 0, NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "scatter_algorithm_segmentsize", + "Segment size in bytes used by default for scatter algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation. Currently, available algorithms do not support segmentation.", + false, false, 0, NULL); mca_param_indices->tree_fanout_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_tree_fanout", - "Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.", - false, false, - ompi_coll_tuned_init_tree_fanout, /* get system wide default */ - NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "scatter_algorithm_tree_fanout", + "Fanout for n-tree used for scatter algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation. Currently, available algorithms do not support n-tree topologies.", + false, false, + ompi_coll_tuned_init_tree_fanout, /* get system wide default */ + NULL); mca_param_indices->chain_fanout_param_index - = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, - "scatter_algorithm_chain_fanout", - "Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. Currently, available algorithms do not support chain topologies.", - false, false, - ompi_coll_tuned_init_chain_fanout, /* get system wide default */ - NULL); + = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version, + "scatter_algorithm_chain_fanout", + "Fanout for chains used for scatter algorithms. Only has meaning if algorithm is forced and supports chain topo based operation. 
 
 int
 ompi_coll_tuned_scatter_intra_do_forced(void *sbuf, int scount,
-                                        struct ompi_datatype_t *sdtype,
-                                        void* rbuf, int rcount,
-                                        struct ompi_datatype_t *rdtype,
-                                        int root,
-                                        struct ompi_communicator_t *comm,
-                                        mca_coll_base_module_t *module)
+                                         struct ompi_datatype_t *sdtype,
+                                         void* rbuf, int rcount,
+                                         struct ompi_datatype_t *rdtype,
+                                         int root,
+                                         struct ompi_communicator_t *comm,
+                                         mca_coll_base_module_t *module)
 {
     mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
     mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
 
     OPAL_OUTPUT((ompi_coll_tuned_stream,
-                 "coll:tuned:scatter_intra_do_forced selected algorithm %d",
-                 data->user_forced[SCATTER].algorithm));
+                  "coll:tuned:scatter_intra_do_forced selected algorithm %d",
+                  data->user_forced[SCATTER].algorithm));
 
     switch (data->user_forced[SCATTER].algorithm) {
     case (0):
-        return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
-                                                        rbuf, rcount, rdtype,
-                                                        root, comm, module);
+         return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
+                                                         rbuf, rcount, rdtype,
+                                                         root, comm, module);
     case (1):
-        return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
-                                                           rbuf, rcount, rdtype,
-                                                           root, comm, module);
-    case (2):
-        return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
-                                                      rbuf, rcount, rdtype,
-                                                      root, comm, module);
-    default:
-        OPAL_OUTPUT((ompi_coll_tuned_stream,
-                     "coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
-                     data->user_forced[SCATTER].algorithm,
-                     ompi_coll_tuned_forced_max_algorithms[SCATTER]));
-        return (MPI_ERR_ARG);
+         return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
+                                                            rbuf, rcount, rdtype,
+                                                            root, comm, module);
+     case (2):
+         return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
+                                                       rbuf, rcount, rdtype,
+                                                       root, comm, module);
+     default:
+         OPAL_OUTPUT((ompi_coll_tuned_stream,
+                      "coll:tuned:scatter_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
+                      data->user_forced[SCATTER].algorithm,
+                      ompi_coll_tuned_forced_max_algorithms[SCATTER]));
+         return (MPI_ERR_ARG);
     } /* switch */
 }
 
 int
 ompi_coll_tuned_scatter_intra_do_this(void *sbuf, int scount,
-                                      struct ompi_datatype_t *sdtype,
-                                      void* rbuf, int rcount,
-                                      struct ompi_datatype_t *rdtype,
-                                      int root,
-                                      struct ompi_communicator_t *comm,
-                                      mca_coll_base_module_t *module,
-                                      int algorithm, int faninout, int segsize)
+                                       struct ompi_datatype_t *sdtype,
+                                       void* rbuf, int rcount,
+                                       struct ompi_datatype_t *rdtype,
+                                       int root,
+                                       struct ompi_communicator_t *comm,
+                                       mca_coll_base_module_t *module,
+                                       int algorithm, int faninout, int segsize)
 {
     OPAL_OUTPUT((ompi_coll_tuned_stream,
-                 "coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
-                 algorithm, faninout, segsize));
+                  "coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
+                  algorithm, faninout, segsize));
 
     switch (algorithm) {
     case (0):
-        return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
-                                                        rbuf, rcount, rdtype,
-                                                        root, comm, module);
+         return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
+                                                         rbuf, rcount, rdtype,
+                                                         root, comm, module);
     case (1):
-        return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
-                                                           rbuf, rcount, rdtype,
-                                                           root, comm, module);
+         return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
+                                                            rbuf, rcount, rdtype,
+                                                            root, comm, module);
     case (2):
-        return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
-                                                      rbuf, rcount, rdtype,
-                                                      root, comm, module);
-    default:
-        OPAL_OUTPUT((ompi_coll_tuned_stream,
-                     "coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
-                     algorithm,
-                     ompi_coll_tuned_forced_max_algorithms[SCATTER]));
-        return (MPI_ERR_ARG);
-    } /* switch */
+         return ompi_coll_tuned_scatter_intra_binomial(sbuf, scount, sdtype,
+                                                       rbuf, rcount, rdtype,
+                                                       root, comm, module);
+     default:
+         OPAL_OUTPUT((ompi_coll_tuned_stream,
+                      "coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
+                      algorithm,
+                      ompi_coll_tuned_forced_max_algorithms[SCATTER]));
+         return (MPI_ERR_ARG);
+     } /* switch */
 }
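That closes the scatter changes. For context on why case 2 scales better than case 1: the binomial implementation walks the binomial tree built in the next file, so the root issues O(log P) sends whose payloads shrink geometrically rather than P-1 equal sends. A hedged, standalone sketch of the parent/child computation (bmtree_children_sketch and its subtree-size bookkeeping are illustrative; the walk mirrors the vrank ^ mask loop in ompi_coll_tuned_topo_build_in_order_bmtree below):

/* Sketch: enumerate a rank's children in the in-order binomial tree used by
 * the binomial scatter, plus the number of ranks in each child's subtree
 * (which bounds how much data is forwarded on that edge). */
#include <stdio.h>

static void bmtree_children_sketch(int rank, int root, int size)
{
    int vrank = (rank - root + size) % size;   /* shift so the root is vrank 0 */
    int mask;

    for (mask = 1; mask < size; mask <<= 1) {
        int remote = vrank ^ mask;
        if (remote < vrank) {
            /* Lowest set bit reached: everything above is the parent side. */
            printf("rank %d: parent is %d\n", rank, (remote + root) % size);
            return;
        }
        if (remote < size) {
            /* The child's subtree holds at most `mask` ranks (fewer at the edge). */
            int subtree = (remote + mask <= size) ? mask : size - remote;
            printf("rank %d: child %d carries %d rank(s)\n",
                   rank, (remote + root) % size, subtree);
        }
    }
    if (vrank == 0)
        printf("rank %d is the root\n", rank);
}

int main(void)
{
    int r;
    for (r = 0; r < 8; r++)
        bmtree_children_sketch(r, 0, 8);
    return 0;
}

For size = 8 and root = 0 the root reports children 1, 2 and 4 carrying 1, 2 and 4 ranks respectively: three sends instead of seven.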
diff --git a/ompi/mca/coll/tuned/coll_tuned_topo.c b/ompi/mca/coll/tuned/coll_tuned_topo.c
index 0f903015ae..8aa469de5e 100644
--- a/ompi/mca/coll/tuned/coll_tuned_topo.c
+++ b/ompi/mca/coll/tuned/coll_tuned_topo.c
@@ -77,13 +77,10 @@ ompi_coll_tuned_topo_build_tree( int fanout,
                                 struct ompi_communicator_t* comm,
                                 int root )
 {
-    int rank, size;
-    int schild, sparent;
+    int rank, size, schild, sparent, shiftedrank, i;
     int level; /* location of my rank in the tree structure of size */
     int delta; /* number of nodes on my level */
     int slimit; /* total number of nodes on levels above me */
-    int shiftedrank;
-    int i;
     ompi_coll_tree_t* tree;
 
     OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree Building fo %d rt %d", fanout, root));
 
@@ -192,9 +189,7 @@ ompi_coll_tuned_topo_build_tree( int fanout,
 ompi_coll_tree_t*
 ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
 {
-    int rank, size;
-    int myrank, rightsize, delta;
-    int parent, lchild, rchild;
+    int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
     ompi_coll_tree_t* tree;
 
     /*
@@ -329,14 +324,8 @@ ompi_coll_tree_t*
 ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
                                    int root )
 {
-    int childs = 0;
-    int rank;
-    int size;
-    int mask = 1;
-    int index;
-    int remote;
+    int childs = 0, rank, size, mask = 1, index, remote, i;
     ompi_coll_tree_t *bmtree;
-    int i;
 
     OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root));
 
@@ -358,7 +347,7 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
     bmtree->tree_root     = MPI_UNDEFINED;
     bmtree->tree_nextsize = MPI_UNDEFINED;
 
-    for(i=0;i<MAXTREEFANOUT;i++) {
+    for( i = 0; i < MAXTREEFANOUT; i++ ) {
         bmtree->tree_next[i] = -1;
     }
 
@@ -409,15 +398,10 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
  */
 ompi_coll_tree_t*
 ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
-                                            int root )
+                                             int root )
 {
-    int childs = 0;
-    int rank, vrank;
-    int size;
-    int mask = 1;
-    int remote;
+    int childs = 0, rank, vrank, size, mask = 1, remote, i;
     ompi_coll_tree_t *bmtree;
-    int i;
 
     OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_in_order_bmtree rt %d", root));
 
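Before the reindented while loop in the next hunk, a concrete trace may help reviewers: for size = 5 and root = 0 (so vrank == rank), that loop yields tree_next = {1, 2, 4} at rank 0 and tree_next = {3} at rank 2, with parents tree_prev = 0 for ranks 1, 2 and 4 and tree_prev = 2 for rank 3; rank 4 skips the remote values 5 and 6 because they fall outside the communicator before its mask = 4 step finds the parent.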
@@ -443,25 +427,25 @@ ompi_coll_tuned_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
     }
 
     if (root == rank) {
-        bmtree->tree_prev = root;
+         bmtree->tree_prev = root;
     }
 
     while (mask < size) {
-        remote = vrank ^ mask;
-        if (remote < vrank) {
-            bmtree->tree_prev = (remote + root) % size;
-            break;
-        } else if (remote < size) {
-            bmtree->tree_next[childs] = (remote + root) % size;
-            childs++;
-            if (childs==MAXTREEFANOUT) {
-                OPAL_OUTPUT((ompi_coll_tuned_stream,
-                             "coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
-                             MAXTREEFANOUT, childs));
-                return NULL;
-            }
-        }
-        mask <<= 1;
+         remote = vrank ^ mask;
+         if (remote < vrank) {
+             bmtree->tree_prev = (remote + root) % size;
+             break;
+         } else if (remote < size) {
+             bmtree->tree_next[childs] = (remote + root) % size;
+             childs++;
+             if (childs==MAXTREEFANOUT) {
+                 OPAL_OUTPUT((ompi_coll_tuned_stream,
+                              "coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d",
+                              MAXTREEFANOUT, childs));
+                 return NULL;
+             }
+         }
+         mask <<= 1;
     }
 
     bmtree->tree_nextsize = childs;
     bmtree->tree_root     = root;
 
@@ -475,10 +459,7 @@ ompi_coll_tuned_topo_build_chain( int fanout,
                                   struct ompi_communicator_t* comm,
                                   int root )
 {
-    int rank, size;
-    int srank; /* shifted rank */
-    int i,maxchainlen;
-    int mark,head,len;
+    int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
     ompi_coll_tree_t *chain;
 
     OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root));
 
diff --git a/ompi/mca/coll/tuned/coll_tuned_util.h b/ompi/mca/coll/tuned/coll_tuned_util.h
index 50be3f5d2a..0a6c041ac4 100644
--- a/ompi/mca/coll/tuned/coll_tuned_util.h
+++ b/ompi/mca/coll/tuned/coll_tuned_util.h
@@ -52,7 +52,7 @@ ompi_coll_tuned_sendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdataty
 {
     if ((dest == myid) && (source == myid)) {
         return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
-                                          recvbuf, (int32_t) rcount, rdatatype);
+                                           recvbuf, (int32_t) rcount, rdatatype);
     }
     return ompi_coll_tuned_sendrecv_actual (sendbuf, scount, sdatatype, dest, stag,
 
@@ -85,7 +85,7 @@ ompi_coll_tuned_sendrecv_localcompleted( void* sendbuf, size_t scount,
 {
     if ((dest == myid) && (source == myid)) {
         return (int) ompi_datatype_sndrcv(sendbuf, (int32_t) scount, sdatatype,
-                                          recvbuf, (int32_t) rcount, rdatatype);
+                                           recvbuf, (int32_t) rcount, rdatatype);
     }
     return ompi_coll_tuned_sendrecv_actual_localcompleted (sendbuf, scount, sdatatype, dest,
 
@@ -103,20 +103,20 @@ ompi_coll_tuned_isendrecv( void* sendbuf, size_t scount, ompi_datatype_t* sdtype
                           void* recvbuf, size_t rcount, ompi_datatype_t* rdtype,
                           int source, int rtag, ompi_request_t** rreq,
                           struct ompi_communicator_t* comm )
 {
-    int ret, line;
+    int ret, line;
 
-    ret = MCA_PML_CALL(irecv(recvbuf, rcount, rdtype, source, rtag, comm, rreq));
-    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
+    ret = MCA_PML_CALL(irecv(recvbuf, rcount, rdtype, source, rtag, comm, rreq));
+    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
 
-    ret = MCA_PML_CALL(isend(sendbuf, scount, sdtype, dest, stag,
-                             MCA_PML_BASE_SEND_STANDARD, comm, sreq));
-    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
+    ret = MCA_PML_CALL(isend(sendbuf, scount, sdtype, dest, stag,
+                              MCA_PML_BASE_SEND_STANDARD, comm, sreq));
+    if (MPI_SUCCESS != ret) { line = __LINE__; goto error_handler; }
 
-    return MPI_SUCCESS;
+    return MPI_SUCCESS;
 
 error_handler:
-    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%d\tError occurred %d\n",
-                 __FILE__, line, ret));
-    return ret;
+    OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%d\tError occurred %d\n",
+                  __FILE__, line, ret));
+    return ret;
 }
 
 END_C_DECLS
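One closing observation on the ompi_coll_tuned_isendrecv helper reindented above: it posts the irecv before the isend, the usual ordering that lets a matching eager send land in the posted-receive queue instead of the unexpected-message queue. The same pairing in plain MPI (a sketch; isendrecv_sketch is illustrative, and unlike the real helper, which hands both requests back to the caller, this one waits on them itself):

/* Sketch of the irecv-before-isend pairing used by ompi_coll_tuned_isendrecv. */
#include <mpi.h>

static int isendrecv_sketch(const void *sendbuf, int scount, MPI_Datatype sdtype, int dest,
                            void *recvbuf, int rcount, MPI_Datatype rdtype, int source,
                            int tag, MPI_Comm comm)
{
    MPI_Request reqs[2];
    int err;

    /* Post the receive first so an incoming message finds it already matched. */
    err = MPI_Irecv(recvbuf, rcount, rdtype, source, tag, comm, &reqs[0]);
    if (MPI_SUCCESS != err) return err;

    err = MPI_Isend(sendbuf, scount, sdtype, dest, tag, comm, &reqs[1]);
    if (MPI_SUCCESS != err) return err;

    /* The real helper returns both requests to the caller; here we just wait. */
    return MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
}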