diff --git a/NEWS b/NEWS index d7b29d3f71..868e6bc3ce 100644 --- a/NEWS +++ b/NEWS @@ -63,6 +63,7 @@ included in the vX.Y.Z section and be denoted as: - OFI/libfabric: Added support for multiple NICs - OFI/libfabric: Added support for Scalable Endpoints - OFI/libfabric: Added btl for one-sided support +- libnbc: Adding numerous performance-improving algorithms 4.0.4 -- June, 2020 ----------------------- diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf index 048d85f342..b86b37c9e2 100644 --- a/contrib/platform/mellanox/optimized.conf +++ b/contrib/platform/mellanox/optimized.conf @@ -10,6 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2019 Mellanox Technologies. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -84,4 +85,8 @@ bml_r2_show_unreach_errors = 0 coll_tuned_alltoall_large_msg = 250000 coll_tuned_alltoall_min_procs = 2048 coll_tuned_alltoall_algorithm_max_requests = 8 +coll_tuned_scatter_intermediate_msg = 8192 +coll_tuned_scatter_large_msg = 250000 +coll_tuned_scatter_min_procs = 1048510 +coll_tuned_scatter_algorithm_max_requests = 64 diff --git a/ompi/mca/coll/base/coll_base_allgatherv.c b/ompi/mca/coll/base/coll_base_allgatherv.c index 08762810db..c3ced000a0 100644 --- a/ompi/mca/coll/base/coll_base_allgatherv.c +++ b/ompi/mca/coll/base/coll_base_allgatherv.c @@ -110,9 +110,6 @@ int ompi_coll_base_allgatherv_intra_bruck(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgather_intra_bruck rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -238,9 +235,6 @@ int ompi_coll_base_allgatherv_intra_ring(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgatherv_intra_ring rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -381,9 +375,6 @@ ompi_coll_base_allgatherv_intra_neighborexchange(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allgatherv_intra_neighborexchange rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } @@ -520,9 +511,6 @@ int ompi_coll_base_allgatherv_intra_two_procs(const void *sbuf, int scount, return MPI_ERR_UNSUPPORTED_OPERATION; } - err = ompi_datatype_get_extent (sdtype, &lb, &sext); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_extent (rdtype, &lb, &rext); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index eeb1d35fb4..828b32061a 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -350,7 +350,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, char *tmpsend = NULL, *tmprecv = NULL, 
*inbuf[2] = {NULL, NULL}; ptrdiff_t true_lb, true_extent, lb, extent; ptrdiff_t block_offset, max_real_segsize; - ompi_request_t *reqs[2] = {NULL, NULL}; + ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -528,6 +528,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count, error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); + ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning if (NULL != inbuf[0]) free(inbuf[0]); if (NULL != inbuf[1]) free(inbuf[1]); @@ -627,7 +628,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; ptrdiff_t block_offset, max_real_segsize; - ompi_request_t *reqs[2] = {NULL, NULL}; + ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; ptrdiff_t lb, extent, gap; size = ompi_comm_size(comm); @@ -847,6 +848,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int error_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n", __FILE__, line, rank, ret)); + ompi_coll_base_free_reqs(reqs, 2); (void)line; // silence compiler warning if (NULL != inbuf[0]) free(inbuf[0]); if (NULL != inbuf[1]) free(inbuf[1]); diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index 3f1bdc5fb5..a61bf40ca9 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -393,6 +393,7 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, if (0 < total_reqs) { reqs = ompi_coll_base_comm_get_reqs(module->base_data, 2 * total_reqs); if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; } + reqs[0] = reqs[1] = MPI_REQUEST_NULL; } prcv = (char *) rbuf; @@ -468,6 +469,15 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount, return MPI_SUCCESS; error_hndl: + /* find a real error code */ + if (MPI_ERR_IN_STATUS == error) { + for( ri = 0; ri < nreqs; ri++ ) { + if (MPI_REQUEST_NULL == reqs[ri]) continue; + if (MPI_ERR_PENDING == reqs[ri]->req_status.MPI_ERROR) continue; + error = reqs[ri]->req_status.MPI_ERROR; + break; + } + } OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error, rank)); @@ -661,7 +671,16 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount, if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err_hndl: - if( MPI_SUCCESS != err ) { + if (MPI_SUCCESS != err) { + /* find a real error code */ + if (MPI_ERR_IN_STATUS == err) { + for( i = 0; i < nreqs; i++ ) { + if (MPI_REQUEST_NULL == req[i]) continue; + if (MPI_ERR_PENDING == req[i]->req_status.MPI_ERROR) continue; + err = req[i]->req_status.MPI_ERROR; + break; + } + } OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning diff --git a/ompi/mca/coll/base/coll_base_alltoallv.c b/ompi/mca/coll/base/coll_base_alltoallv.c index aec8b85944..dbe33e8eee 100644 --- a/ompi/mca/coll/base/coll_base_alltoallv.c +++ b/ompi/mca/coll/base/coll_base_alltoallv.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. 
All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -276,6 +276,15 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE); err_hndl: + /* find a real error code */ + if (MPI_ERR_IN_STATUS == err) { + for( i = 0; i < nreqs; i++ ) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + err = reqs[i]->req_status.MPI_ERROR; + break; + } + } /* Free the requests in all cases as they are persistent */ ompi_coll_base_free_reqs(reqs, nreqs); diff --git a/ompi/mca/coll/base/coll_base_barrier.c b/ompi/mca/coll/base/coll_base_barrier.c index a190f3be72..49ac4ea2e9 100644 --- a/ompi/mca/coll/base/coll_base_barrier.c +++ b/ompi/mca/coll/base/coll_base_barrier.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -102,8 +102,10 @@ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm, { int rank, size, err = 0, line = 0, left, right; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return OMPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank)); @@ -172,8 +174,10 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c { int rank, size, adjsize, err, line, mask, remote; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return OMPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_recursivedoubling rank %d", rank)); @@ -251,8 +255,10 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm, { int rank, size, distance, to, from, err, line = 0; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_bruck rank %d", rank)); @@ -285,16 +291,19 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm, int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int remote, err; + int remote, size, err; + + size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + if( 2 != ompi_comm_size(comm) ) { + return MPI_ERR_UNSUPPORTED_OPERATION; + } remote = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_two_procs rank %d", remote)); - if (2 != ompi_comm_size(comm)) { - return MPI_ERR_UNSUPPORTED_OPERATION; - } - remote = (remote + 1) & 0x1; err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER, @@ -324,8 +333,10 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm, int i, err, rank, size, line; 
ompi_request_t** requests = NULL; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + rank = ompi_comm_rank(comm); /* All non-root send & receive zero-length message. */ if (rank > 0) { @@ -367,11 +378,21 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm, /* All done */ return MPI_SUCCESS; err_hndl: + if( NULL != requests ) { + /* find a real error code */ + if (MPI_ERR_IN_STATUS == err) { + for( i = 0; i < size; i++ ) { + if (MPI_REQUEST_NULL == requests[i]) continue; + if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue; + err = requests[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(requests, size); + } OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warning - if( NULL != requests ) - ompi_coll_base_free_reqs(requests, size); return err; } /* copied function (with appropriate renaming) ends here */ @@ -385,8 +406,10 @@ int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm, { int rank, size, depth, err, jump, partner; - rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); + if( 1 == size ) + return MPI_SUCCESS; + rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_barrier_intra_tree %d", rank)); diff --git a/ompi/mca/coll/base/coll_base_bcast.c b/ompi/mca/coll/base/coll_base_bcast.c index a35e18fa9c..7af75353d2 100644 --- a/ompi/mca/coll/base/coll_base_bcast.c +++ b/ompi/mca/coll/base/coll_base_bcast.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -214,13 +214,29 @@ ompi_coll_base_bcast_intra_generic( void* buffer, return (MPI_SUCCESS); error_hndl: + if (MPI_ERR_IN_STATUS == err) { + for( req_index = 0; req_index < 2; req_index++ ) { + if (MPI_REQUEST_NULL == recv_reqs[req_index]) continue; + if (MPI_ERR_PENDING == recv_reqs[req_index]->req_status.MPI_ERROR) continue; + err = recv_reqs[req_index]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs( recv_reqs, 2); + if( NULL != send_reqs ) { + if (MPI_ERR_IN_STATUS == err) { + for( req_index = 0; req_index < tree->tree_nextsize; req_index++ ) { + if (MPI_REQUEST_NULL == send_reqs[req_index]) continue; + if (MPI_ERR_PENDING == send_reqs[req_index]->req_status.MPI_ERROR) continue; + err = send_reqs[req_index]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize); + } OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank) ); (void)line; // silence compiler warnings - ompi_coll_base_free_reqs( recv_reqs, 2); - if( NULL != send_reqs ) { - ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize); - } return err; } @@ -630,7 +646,9 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count, /* Root sends data to all others. 
*/ preq = reqs = ompi_coll_base_comm_get_reqs(module->base_data, size-1); - if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; } + if( NULL == reqs ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } for (i = 0; i < size; ++i) { if (i == rank) { @@ -649,12 +667,21 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count, * care what the error was -- just that there *was* an error. The * PML will finish all requests, even if one or more of them fail. * i.e., by the end of this call, all the requests are free-able. - * So free them anyway -- even if there was an error, and return - * the error after we free everything. */ + * So free them anyway -- even if there was an error. + * Note we still need to get the actual error, as collective + * operations cannot return MPI_ERR_IN_STATUS. + */ err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE); err_hndl: if( MPI_SUCCESS != err ) { /* Free the reqs */ + /* first find the real error code */ + for( preq = reqs; preq < reqs+i; preq++ ) { + if (MPI_REQUEST_NULL == *preq) continue; + if (MPI_ERR_PENDING == (*preq)->req_status.MPI_ERROR) continue; + err = (*preq)->req_status.MPI_ERROR; + break; + } ompi_coll_base_free_reqs(reqs, i); } diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index 40de8762eb..11b46ba47e 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -18,6 +18,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -291,6 +292,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS); /* Scatter */ int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS); int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS); +int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs); /* ScatterV */ diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index 8d5ab70d70..6fd1e98146 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -326,6 +326,15 @@ ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount, return MPI_SUCCESS; error_hndl: if (NULL != reqs) { + /* find a real error code */ + if (MPI_ERR_IN_STATUS == ret) { + for( i = 0; i < size; i++ ) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + ret = reqs[i]->req_status.MPI_ERROR; + break; + } + } ompi_coll_base_free_reqs(reqs, size); } OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index 82838ddbcd..dfd709bfb9 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -338,16 +338,34 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi return OMPI_SUCCESS; error_hndl: /* error handler */ + /* find a real error code */ + if (MPI_ERR_IN_STATUS == ret) { + for( i = 0; i < 2; i++ ) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + ret = reqs[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(reqs, 2); + if( NULL != sreq ) { + if (MPI_ERR_IN_STATUS == ret) { + for( i = 0; i < max_outstanding_reqs; i++ ) { + if 
(MPI_REQUEST_NULL == sreq[i]) continue; + if (MPI_ERR_PENDING == sreq[i]->req_status.MPI_ERROR) continue; + ret = sreq[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); + } + if( inbuf_free[0] != NULL ) free(inbuf_free[0]); + if( inbuf_free[1] != NULL ) free(inbuf_free[1]); + if( accumbuf_free != NULL ) free(accumbuf); OPAL_OUTPUT (( ompi_coll_base_framework.framework_output, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); (void)line; // silence compiler warning - if( inbuf_free[0] != NULL ) free(inbuf_free[0]); - if( inbuf_free[1] != NULL ) free(inbuf_free[1]); - if( accumbuf_free != NULL ) free(accumbuf); - if( NULL != sreq ) { - ompi_coll_base_free_reqs(sreq, max_outstanding_reqs); - } return ret; } diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index 948a17376c..984a91787a 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -464,7 +464,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL; char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL}; ptrdiff_t extent, max_real_segsize, dsize, gap = 0; - ompi_request_t *reqs[2] = {NULL, NULL}; + ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index 648845689d..0ca3597153 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -14,6 +14,7 @@ * reserved. * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -273,5 +274,114 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount, return MPI_SUCCESS; } - /* copied function (with appropriate renaming) ends here */ + +/* + * Use isends for distributing the data with periodic sync by blocking send. + * Blocking send acts like a local resources flush, because it ensures + * progression until the message is sent/(copied to some sort of transmit buffer). + */ +int +ompi_coll_base_scatter_intra_linear_nb(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module, + int max_reqs) +{ + int i, rank, size, err, line, nreqs; + ptrdiff_t incr; + char *ptmp; + ompi_request_t **reqs = NULL, **preq; + + rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + + /* If not root, receive data. */ + if (rank != root) { + err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root, + MCA_COLL_BASE_TAG_SCATTER, + comm, MPI_STATUS_IGNORE)); + if (MPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + + return MPI_SUCCESS; + } + + if (max_reqs <= 1) { + max_reqs = 0; + nreqs = size - 1; /* no send for myself */ + } else { + /* We use blocking MPI_Send (which does not need a request) + * every max_reqs send operation (which is size/max_reqs at most), + * therefore no need to allocate requests for these sends. 
*/ + nreqs = size - (size / max_reqs); + } + + reqs = ompi_coll_base_comm_get_reqs(module->base_data, nreqs); + if (NULL == reqs) { + err = OMPI_ERR_OUT_OF_RESOURCE; + line = __LINE__; goto err_hndl; + } + + err = ompi_datatype_type_extent(sdtype, &incr); + if (OMPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + incr *= scount; + + /* I am the root, loop sending data. */ + for (i = 0, ptmp = (char *)sbuf, preq = reqs; i < size; ++i, ptmp += incr) { + /* simple optimization */ + if (i == rank) { + if (MPI_IN_PLACE != rbuf) { + err = ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount, + rdtype); + } + } else { + if (!max_reqs || (i % max_reqs)) { + err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i, + MCA_COLL_BASE_TAG_SCATTER, + MCA_PML_BASE_SEND_STANDARD, + comm, preq++)); + } else { + err = MCA_PML_CALL(send(ptmp, scount, sdtype, i, + MCA_COLL_BASE_TAG_SCATTER, + MCA_PML_BASE_SEND_STANDARD, + comm)); + } + } + if (MPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + } + + err = ompi_request_wait_all(preq - reqs, reqs, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + + return MPI_SUCCESS; + +err_hndl: + if (NULL != reqs) { + /* find a real error code */ + if (MPI_ERR_IN_STATUS == err) { + for (i = 0; i < nreqs; i++) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + err = reqs[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(reqs, nreqs); + } + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); + (void)line; /* silence compiler warning */ + return err; +} + diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 5736c0946f..422894e45f 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -42,7 +42,7 @@ int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount, { /* post receive first, then send, then wait... should be fast (I hope) */ int err, line = 0; size_t rtypesize, stypesize; - ompi_request_t *req; + ompi_request_t *req = MPI_REQUEST_NULL; ompi_status_public_t rstatus; /* post new irecv */ diff --git a/ompi/mca/coll/cuda/coll_cuda_module.c b/ompi/mca/coll/cuda/coll_cuda_module.c index d8702dd378..137f55a763 100644 --- a/ompi/mca/coll/cuda/coll_cuda_module.c +++ b/ompi/mca/coll/cuda/coll_cuda_module.c @@ -3,6 +3,8 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2019 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -19,8 +21,8 @@ #include "mpi.h" -#include "orte/util/show_help.h" -#include "orte/util/proc_info.h" +#include "opal/util/show_help.h" +#include "ompi/mca/rte/rte.h" #include "ompi/constants.h" #include "ompi/communicator/communicator.h" @@ -146,8 +148,8 @@ int mca_coll_cuda_module_enable(mca_coll_base_module_t *module, if (good) { return OMPI_SUCCESS; } - orte_show_help("help-mpi-coll-cuda.txt", "missing collective", true, - orte_process_info.nodename, + opal_show_help("help-mpi-coll-cuda.txt", "missing collective", true, + ompi_process_info.nodename, mca_coll_cuda_component.priority, msg); return OMPI_ERR_NOT_FOUND; } diff --git a/ompi/mca/coll/hcoll/coll_hcoll.h b/ompi/mca/coll/hcoll/coll_hcoll.h index aaecbc11fe..d7bb79658e 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll.h +++ b/ompi/mca/coll/hcoll/coll_hcoll.h @@ -138,6 +138,8 @@ struct mca_coll_hcoll_module_t { mca_coll_base_module_t *previous_gather_module; mca_coll_base_module_gatherv_fn_t previous_gatherv; mca_coll_base_module_t *previous_gatherv_module; + mca_coll_base_module_scatterv_fn_t previous_scatterv; + mca_coll_base_module_t *previous_scatterv_module; mca_coll_base_module_reduce_scatter_fn_t previous_reduce_scatter; mca_coll_base_module_t *previous_reduce_scatter_module; mca_coll_base_module_ibcast_fn_t previous_ibcast; @@ -241,6 +243,15 @@ int mca_coll_hcoll_gatherv(const void* sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_hcoll_scatterv(const void* sbuf, const int *scounts, const int *displs, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); + int mca_coll_hcoll_ibarrier(struct ompi_communicator_t *comm, ompi_request_t** request, mca_coll_base_module_t *module); diff --git a/ompi/mca/coll/hcoll/coll_hcoll_module.c b/ompi/mca/coll/hcoll/coll_hcoll_module.c index aa262c9849..7e638bb309 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_module.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_module.c @@ -45,6 +45,7 @@ static void mca_coll_hcoll_module_clear(mca_coll_hcoll_module_t *hcoll_module) hcoll_module->previous_allgatherv = NULL; hcoll_module->previous_gather = NULL; hcoll_module->previous_gatherv = NULL; + hcoll_module->previous_scatterv = NULL; hcoll_module->previous_alltoall = NULL; hcoll_module->previous_alltoallv = NULL; hcoll_module->previous_alltoallw = NULL; @@ -68,6 +69,7 @@ static void mca_coll_hcoll_module_clear(mca_coll_hcoll_module_t *hcoll_module) hcoll_module->previous_allgatherv_module = NULL; hcoll_module->previous_gather_module = NULL; hcoll_module->previous_gatherv_module = NULL; + hcoll_module->previous_scatterv_module = NULL; hcoll_module->previous_alltoall_module = NULL; hcoll_module->previous_alltoallv_module = NULL; hcoll_module->previous_alltoallw_module = NULL; @@ -120,6 +122,7 @@ static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_allgather_module); OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_allgatherv_module); OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_gatherv_module); + OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_scatterv_module); OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_alltoall_module); OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_alltoallv_module); OBJ_RELEASE_IF_NOT_NULL(hcoll_module->previous_reduce_module); @@ -174,6 +177,7 @@ static int 
mca_coll_hcoll_save_coll_handlers(mca_coll_hcoll_module_t *hcoll_modu HCOL_SAVE_PREV_COLL_API(allgather); HCOL_SAVE_PREV_COLL_API(allgatherv); HCOL_SAVE_PREV_COLL_API(gatherv); + HCOL_SAVE_PREV_COLL_API(scatterv); HCOL_SAVE_PREV_COLL_API(alltoall); HCOL_SAVE_PREV_COLL_API(alltoallv); @@ -392,6 +396,7 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority) hcoll_module->super.coll_alltoall = hcoll_collectives.coll_alltoall ? mca_coll_hcoll_alltoall : NULL; hcoll_module->super.coll_alltoallv = hcoll_collectives.coll_alltoallv ? mca_coll_hcoll_alltoallv : NULL; hcoll_module->super.coll_gatherv = hcoll_collectives.coll_gatherv ? mca_coll_hcoll_gatherv : NULL; + hcoll_module->super.coll_scatterv = hcoll_collectives.coll_scatterv ? mca_coll_hcoll_scatterv : NULL; hcoll_module->super.coll_reduce = hcoll_collectives.coll_reduce ? mca_coll_hcoll_reduce : NULL; hcoll_module->super.coll_ibarrier = hcoll_collectives.coll_ibarrier ? mca_coll_hcoll_ibarrier : NULL; hcoll_module->super.coll_ibcast = hcoll_collectives.coll_ibcast ? mca_coll_hcoll_ibcast : NULL; diff --git a/ompi/mca/coll/hcoll/coll_hcoll_ops.c b/ompi/mca/coll/hcoll/coll_hcoll_ops.c index de563e455b..5791fe17db 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_ops.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_ops.c @@ -397,6 +397,43 @@ int mca_coll_hcoll_gatherv(const void* sbuf, int scount, } +int mca_coll_hcoll_scatterv(const void* sbuf, const int *scounts, const int *displs, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + dte_data_representation_t stype; + dte_data_representation_t rtype; + int rc; + HCOL_VERBOSE(20,"RUNNING HCOL SCATTERV"); + mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module; + stype = ompi_dtype_2_hcoll_dtype(sdtype, NO_DERIVED); + rtype = ompi_dtype_2_hcoll_dtype(rdtype, NO_DERIVED); + if (OPAL_UNLIKELY(HCOL_DTE_IS_ZERO(stype) || HCOL_DTE_IS_ZERO(rtype))) { + /*If we are here then datatype is not simple predefined datatype */ + /*In future we need to add more complex mapping to the dte_data_representation_t */ + /* Now use fallback */ + HCOL_VERBOSE(20,"Ompi_datatype is not supported: sdtype = %s, rdtype = %s; calling fallback scatterv;", + sdtype->super.name, + rdtype->super.name); + rc = hcoll_module->previous_scatterv(sbuf, scounts, displs, sdtype, + rbuf, rcount, rdtype, root, + comm, hcoll_module->previous_scatterv_module); + return rc; + } + rc = hcoll_collectives.coll_scatterv((void *)sbuf, (int *)scounts, (int *)displs, stype, rbuf, rcount, rtype, root, hcoll_module->hcoll_context); + if (HCOLL_SUCCESS != rc){ + HCOL_VERBOSE(20,"RUNNING FALLBACK SCATTERV"); + rc = hcoll_module->previous_scatterv(sbuf, scounts, displs, sdtype, + rbuf, rcount, rdtype, root, + comm, hcoll_module->previous_scatterv_module); + } + return rc; +} + int mca_coll_hcoll_ibarrier(struct ompi_communicator_t *comm, ompi_request_t ** request, mca_coll_base_module_t *module) diff --git a/ompi/mca/coll/libnbc/coll_libnbc.h b/ompi/mca/coll/libnbc/coll_libnbc.h index 17abf86f2a..badc187077 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc.h +++ b/ompi/mca/coll/libnbc/coll_libnbc.h @@ -70,6 +70,13 @@ BEGIN_C_DECLS #define NBC_NUM_COLL 17 extern bool libnbc_ibcast_skip_dt_decision; +extern int libnbc_iallgather_algorithm; +extern int libnbc_iallreduce_algorithm; +extern int libnbc_ibcast_algorithm; +extern int libnbc_ibcast_knomial_radix; +extern int libnbc_iexscan_algorithm; +extern int 
libnbc_ireduce_algorithm; +extern int libnbc_iscan_algorithm; struct ompi_coll_libnbc_component_t { mca_coll_base_component_2_0_0_t super; diff --git a/ompi/mca/coll/libnbc/coll_libnbc_component.c b/ompi/mca/coll/libnbc/coll_libnbc_component.c index 2e23d2b739..bcb0e06c2d 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc_component.c +++ b/ompi/mca/coll/libnbc/coll_libnbc_component.c @@ -46,6 +46,59 @@ static int libnbc_priority = 10; static bool libnbc_in_progress = false; /* protect from recursive calls */ bool libnbc_ibcast_skip_dt_decision = true; +int libnbc_iallgather_algorithm = 0; /* iallgather user forced algorithm */ +static mca_base_var_enum_value_t iallgather_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + +int libnbc_iallreduce_algorithm = 0; /* iallreduce user forced algorithm */ +static mca_base_var_enum_value_t iallreduce_algorithms[] = { + {0, "ignore"}, + {1, "ring"}, + {2, "binomial"}, + {3, "rabenseifner"}, + {4, "recursive_doubling"}, + {0, NULL} +}; + +int libnbc_ibcast_algorithm = 0; /* ibcast user forced algorithm */ +int libnbc_ibcast_knomial_radix = 4; +static mca_base_var_enum_value_t ibcast_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "binomial"}, + {3, "chain"}, + {4, "knomial"}, + {0, NULL} +}; + +int libnbc_iexscan_algorithm = 0; /* iexscan user forced algorithm */ +static mca_base_var_enum_value_t iexscan_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; + +int libnbc_ireduce_algorithm = 0; /* ireduce user forced algorithm */ +static mca_base_var_enum_value_t ireduce_algorithms[] = { + {0, "ignore"}, + {1, "chain"}, + {2, "binomial"}, + {3, "rabenseifner"}, + {0, NULL} +}; + +int libnbc_iscan_algorithm = 0; /* iscan user forced algorithm */ +static mca_base_var_enum_value_t iscan_algorithms[] = { + {0, "ignore"}, + {1, "linear"}, + {2, "recursive_doubling"}, + {0, NULL} +}; static int libnbc_open(void); static int libnbc_close(void); @@ -54,7 +107,6 @@ static int libnbc_init_query(bool, bool); static mca_coll_base_module_t *libnbc_comm_query(struct ompi_communicator_t *, int *); static int libnbc_module_enable(mca_coll_base_module_t *, struct ompi_communicator_t *); - /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -128,6 +180,8 @@ libnbc_close(void) static int libnbc_register(void) { + mca_base_var_enum_t *new_enum = NULL; + /* Use a low priority, but allow other components to be lower */ libnbc_priority = 10; (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, @@ -158,11 +212,77 @@ libnbc_register(void) MCA_BASE_VAR_SCOPE_READONLY, &libnbc_ibcast_skip_dt_decision); + libnbc_iallgather_algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iallgather_algorithms", iallgather_algorithms, &new_enum); + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iallgather_algorithm", + "Which iallgather algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, + &libnbc_iallgather_algorithm); + OBJ_RELEASE(new_enum); + + libnbc_iallreduce_algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum); + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iallreduce_algorithm", + "Which iallreduce algorithm is used: 0 ignore, 1 
ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, + &libnbc_iallreduce_algorithm); + OBJ_RELEASE(new_enum); + + libnbc_ibcast_algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ibcast_algorithms", ibcast_algorithms, &new_enum); + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_algorithm", + "Which ibcast algorithm is used: 0 ignore, 1 linear, 2 binomial, 3 chain, 4 knomial", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, + &libnbc_ibcast_algorithm); + OBJ_RELEASE(new_enum); + + libnbc_ibcast_knomial_radix = 4; + (void) mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ibcast_knomial_radix", "k-nomial tree radix for the ibcast algorithm (radix > 1)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &libnbc_ibcast_knomial_radix); + + libnbc_iexscan_algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iexscan_algorithms", iexscan_algorithms, &new_enum); + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iexscan_algorithm", + "Which iexscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, + &libnbc_iexscan_algorithm); + OBJ_RELEASE(new_enum); + + libnbc_ireduce_algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_ireduce_algorithms", ireduce_algorithms, &new_enum); + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "ireduce_algorithm", + "Which ireduce algorithm is used: 0 ignore, 1 chain, 2 binomial, 3 rabenseifner", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, + &libnbc_ireduce_algorithm); + OBJ_RELEASE(new_enum); + + libnbc_iscan_algorithm = 0; + (void) mca_base_var_enum_create("coll_libnbc_iscan_algorithms", iscan_algorithms, &new_enum); + mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version, + "iscan_algorithm", + "Which iscan algorithm is used: 0 ignore, 1 linear, 2 recursive_doubling", + MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, + &libnbc_iscan_algorithm); + OBJ_RELEASE(new_enum); + return OMPI_SUCCESS; } - - /* * Initial query function that is invoked during MPI_INIT, allowing * this component to disqualify itself if it doesn't support the diff --git a/ompi/mca/coll/libnbc/libdict/dict.h b/ompi/mca/coll/libnbc/libdict/dict.h index 4680b2bc2c..61f213ebaf 100644 --- a/ompi/mca/coll/libnbc/libdict/dict.h +++ b/ompi/mca/coll/libnbc/libdict/dict.h @@ -62,7 +62,6 @@ struct dict { int (*_insert) __P((void *obj, void *k, void *d, int ow)); int (*_probe) __P((void *obj, void *key, void **dat)); void *(*_search) __P((void *obj, const void *k)); - const void *(*_csearch) __P((const void *obj, const void *k)); int (*_remove) __P((void *obj, const void *key, int del)); void (*_walk) __P((void *obj, dict_vis_func func)); unsigned (*_count) __P((const void *obj)); @@ -75,7 +74,6 @@ struct dict { #define dict_insert(dct,k,d,o) (dct)->_insert((dct)->_object, (k), (d), (o)) #define dict_probe(dct,k,d) (dct)->_probe((dct)->_object, (k), (d)) #define dict_search(dct,k) (dct)->_search((dct)->_object, (k)) -#define dict_csearch(dct,k) 
(dct)->_csearch((dct)->_object, (k)) #define dict_remove(dct,k,del) (dct)->_remove((dct)->_object, (k), (del)) #define dict_walk(dct,f) (dct)->_walk((dct)->_object, (f)) #define dict_count(dct) (dct)->_count((dct)->_object) diff --git a/ompi/mca/coll/libnbc/libdict/dict_private.h b/ompi/mca/coll/libnbc/libdict/dict_private.h index da2b6dbdea..3635f58700 100644 --- a/ompi/mca/coll/libnbc/libdict/dict_private.h +++ b/ompi/mca/coll/libnbc/libdict/dict_private.h @@ -15,7 +15,6 @@ typedef int (*insert_func) __P((void *, void *k, void *d, int o)); typedef int (*probe_func) __P((void *, void *k, void **d)); typedef void *(*search_func) __P((void *, const void *k)); -typedef const void *(*csearch_func) __P((const void *, const void *k)); typedef int (*remove_func) __P((void *, const void *k, int d)); typedef void (*walk_func) __P((void *, dict_vis_func visit)); typedef unsigned (*count_func) __P((const void *)); diff --git a/ompi/mca/coll/libnbc/libdict/hb_tree.c b/ompi/mca/coll/libnbc/libdict/hb_tree.c index c3837ed3cb..4b454c4457 100644 --- a/ompi/mca/coll/libnbc/libdict/hb_tree.c +++ b/ompi/mca/coll/libnbc/libdict/hb_tree.c @@ -90,7 +90,6 @@ hb_dict_new(dict_cmp_func key_cmp, dict_del_func key_del, dct->_insert = (insert_func)hb_tree_insert; dct->_probe = (probe_func)hb_tree_probe; dct->_search = (search_func)hb_tree_search; - dct->_csearch = (csearch_func)hb_tree_csearch; dct->_remove = (remove_func)hb_tree_remove; dct->_empty = (empty_func)hb_tree_empty; dct->_walk = (walk_func)hb_tree_walk; @@ -170,12 +169,6 @@ hb_tree_search(hb_tree *tree, const void *key) return NULL; } -const void * -hb_tree_csearch(const hb_tree *tree, const void *key) -{ - return hb_tree_csearch((hb_tree *)tree, key); -} - int hb_tree_insert(hb_tree *tree, void *key, void *dat, int overwrite) { diff --git a/ompi/mca/coll/libnbc/libdict/hb_tree.h b/ompi/mca/coll/libnbc/libdict/hb_tree.h index 2de8af6d19..8be94e1453 100644 --- a/ompi/mca/coll/libnbc/libdict/hb_tree.h +++ b/ompi/mca/coll/libnbc/libdict/hb_tree.h @@ -26,7 +26,6 @@ void hb_tree_destroy __P((hb_tree *tree, int del)); int hb_tree_insert __P((hb_tree *tree, void *key, void *dat, int overwrite)); int hb_tree_probe __P((hb_tree *tree, void *key, void **dat)); void *hb_tree_search __P((hb_tree *tree, const void *key)); -const void *hb_tree_csearch __P((const hb_tree *tree, const void *key)); int hb_tree_remove __P((hb_tree *tree, const void *key, int del)); void hb_tree_empty __P((hb_tree *tree, int del)); void hb_tree_walk __P((hb_tree *tree, dict_vis_func visit)); diff --git a/ompi/mca/coll/libnbc/nbc_iallgather.c b/ompi/mca/coll/libnbc/nbc_iallgather.c index e7a9104fba..29ba7a6a9c 100644 --- a/ompi/mca/coll/libnbc/nbc_iallgather.c +++ b/ompi/mca/coll/libnbc/nbc_iallgather.c @@ -20,6 +20,15 @@ */ #include "nbc_internal.h" +static inline int allgather_sched_linear( + int rank, int comm_size, NBC_Schedule *schedule, const void *sendbuf, + int scount, struct ompi_datatype_t *sdtype, void *recvbuf, int rcount, + struct ompi_datatype_t *rdtype); +static inline int allgather_sched_recursivedoubling( + int rank, int comm_size, NBC_Schedule *schedule, const void *sbuf, + int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, + struct ompi_datatype_t *rdtype); + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, void *param) { @@ -40,10 +49,6 @@ int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, voi } #endif -/* simple linear 
MPI_Iallgather - * the algorithm uses p-1 rounds - * each node sends the packet it received last round (or has in round 0) to it's right neighbor (modulo p) - * each node receives from it's left (modulo p) neighbor */ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) @@ -51,16 +56,31 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s int rank, p, res; MPI_Aint rcvext; NBC_Schedule *schedule; - char *rbuf, *sbuf, inplace; + char *rbuf, inplace; #ifdef NBC_CACHE_SCHEDULE NBC_Allgather_args *args, *found, search; #endif + enum { NBC_ALLGATHER_LINEAR, NBC_ALLGATHER_RDBL} alg; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; NBC_IN_PLACE(sendbuf, recvbuf, inplace); rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); + int is_commsize_pow2 = !(p & (p - 1)); + + if (libnbc_iallgather_algorithm == 0) { + alg = NBC_ALLGATHER_LINEAR; + } else { + /* user forced dynamic decision */ + if (libnbc_iallgather_algorithm == 1) { + alg = NBC_ALLGATHER_LINEAR; + } else if (libnbc_iallgather_algorithm == 2 && is_commsize_pow2) { + alg = NBC_ALLGATHER_RDBL; + } else { + alg = NBC_ALLGATHER_LINEAR; + } + } res = ompi_datatype_type_extent(recvtype, &rcvext); if (MPI_SUCCESS != res) { @@ -98,36 +118,32 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s return OMPI_ERR_OUT_OF_RESOURCE; } - sbuf = (char *)recvbuf + rank * recvcount * rcvext; - - if (persistent && !inplace) { /* for nonblocking, data has been copied already */ + if (persistent && !inplace) { + /* for nonblocking, data has been copied already */ /* copy my data to receive buffer (= send buffer of NBC_Sched_send) */ - res = NBC_Sched_copy ((void *)sendbuf, false, sendcount, sendtype, - sbuf, false, recvcount, recvtype, schedule, true); + rbuf = (char *)recvbuf + rank * recvcount * rcvext; + res = NBC_Sched_copy((void *)sendbuf, false, sendcount, sendtype, + rbuf, false, recvcount, recvtype, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { OBJ_RELEASE(schedule); return res; } } - /* do p-1 rounds */ - for(int r = 0 ; r < p ; ++r) { - if(r != rank) { - /* recv from rank r */ - rbuf = (char *)recvbuf + r * recvcount * rcvext; - res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, r, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - return res; - } + switch (alg) { + case NBC_ALLGATHER_LINEAR: + res = allgather_sched_linear(rank, p, schedule, sendbuf, sendcount, sendtype, + recvbuf, recvcount, recvtype); + break; + case NBC_ALLGATHER_RDBL: + res = allgather_sched_recursivedoubling(rank, p, schedule, sendbuf, sendcount, + sendtype, recvbuf, recvcount, recvtype); + break; + } - /* send to rank r - not from the sendbuf to optimize MPI_IN_PLACE */ - res = NBC_Sched_send (sbuf, false, recvcount, recvtype, r, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - return res; - } - } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + OBJ_RELEASE(schedule); + return res; } res = NBC_Sched_commit(schedule); @@ -270,6 +286,109 @@ int ompi_coll_libnbc_iallgather_inter(const void* sendbuf, int sendcount, MPI_Da return OMPI_SUCCESS; } +/* + * allgather_sched_linear + * + * Description: an implementation of Iallgather using linear algorithm + * + * Time: O(comm_size) + * 
Schedule length (rounds): O(comm_size) + */ +static inline int allgather_sched_linear( + int rank, int comm_size, NBC_Schedule *schedule, const void *sendbuf, + int scount, struct ompi_datatype_t *sdtype, void *recvbuf, int rcount, + struct ompi_datatype_t *rdtype) +{ + int res = OMPI_SUCCESS; + ptrdiff_t rlb, rext; + + res = ompi_datatype_get_extent(rdtype, &rlb, &rext); + char *sbuf = (char *)recvbuf + rank * rcount * rext; + + for (int remote = 0; remote < comm_size ; ++remote) { + if (remote != rank) { + /* Recv from rank remote */ + char *rbuf = (char *)recvbuf + remote * rcount * rext; + res = NBC_Sched_recv(rbuf, false, rcount, rdtype, remote, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* Send to rank remote - not from the sendbuf to optimize MPI_IN_PLACE */ + res = NBC_Sched_send(sbuf, false, rcount, rdtype, remote, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + } + +cleanup_and_return: + return res; +} + +/* + * allgather_sched_recursivedoubling + * + * Description: an implementation of Iallgather using recursive doubling algorithm + * Limitation: power-of-two number of processes only + * Time: O(log(comm_size)) + * Schedule length (rounds): O(log(comm_size)) + * Memory: no additional memory requirements beyond user-supplied buffers. + * + * Example on 4 nodes: + * Initialization: everyone has its own buffer at location rank in rbuf + * # 0 1 2 3 + * [0] [ ] [ ] [ ] + * [ ] [1] [ ] [ ] + * [ ] [ ] [2] [ ] + * [ ] [ ] [ ] [3] + * Step 0: exchange data with (rank ^ 2^0) + * # 0 1 2 3 + * [0] [0] [ ] [ ] + * [1] [1] [ ] [ ] + * [ ] [ ] [2] [2] + * [ ] [ ] [3] [3] + * Step 1: exchange data with (rank ^ 2^1) (if you can) + * # 0 1 2 3 + * [0] [0] [0] [0] + * [1] [1] [1] [1] + * [2] [2] [2] [2] + * [3] [3] [3] [3] + * + */ +static inline int allgather_sched_recursivedoubling( + int rank, int comm_size, NBC_Schedule *schedule, const void *sbuf, + int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, + struct ompi_datatype_t *rdtype) +{ + int res = OMPI_SUCCESS; + ptrdiff_t rlb, rext; + char *tmpsend = NULL, *tmprecv = NULL; + + res = ompi_datatype_get_extent(rdtype, &rlb, &rext); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + int sendblocklocation = rank; + for (int distance = 1; distance < comm_size; distance <<= 1) { + int remote = rank ^ distance; + + tmpsend = (char *)rbuf + (ptrdiff_t)sendblocklocation * (ptrdiff_t)rcount * rext; + if (rank < remote) { + tmprecv = (char *)rbuf + (ptrdiff_t)(sendblocklocation + distance) * (ptrdiff_t)rcount * rext; + } else { + tmprecv = (char *)rbuf + (ptrdiff_t)(sendblocklocation - distance) * (ptrdiff_t)rcount * rext; + sendblocklocation -= distance; + } + + res = NBC_Sched_send(tmpsend, false, (ptrdiff_t)distance * (ptrdiff_t)rcount, + rdtype, remote, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + res = NBC_Sched_recv(tmprecv, false, (ptrdiff_t)distance * (ptrdiff_t)rcount, + rdtype, remote, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + +cleanup_and_return: + return res; +} + int ompi_coll_libnbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module) { diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c 
b/ompi/mca/coll/libnbc/nbc_iallreduce.c index 57aa0d77e0..b8e9f27cbd 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -22,17 +22,25 @@ #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/op/op.h" +#include "opal/util/bit_ops.h" #include static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf, void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf); +static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf, + int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, + char inplace, NBC_Schedule *schedule, void *tmpbuf); static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, void *tmpbuf); static inline int allred_sched_linear(int rank, int p, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, int ext, int size, NBC_Schedule *schedule, void *tmpbuf); +static inline int allred_sched_redscat_allgather( + int rank, int comm_size, int count, MPI_Datatype datatype, ptrdiff_t gap, + const void *sbuf, void *rbuf, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmpbuf, struct ompi_communicator_t *comm); #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ @@ -64,7 +72,7 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI #ifdef NBC_CACHE_SCHEDULE NBC_Allreduce_args *args, *found, search; #endif - enum { NBC_ARED_BINOMIAL, NBC_ARED_RING } alg; + enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } alg; char inplace; void *tmpbuf = NULL; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; @@ -105,12 +113,27 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI } /* algorithm selection */ - if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { - alg = NBC_ARED_BINOMIAL; + int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; + if (libnbc_iallreduce_algorithm == 0) { + if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { + alg = NBC_ARED_BINOMIAL; + } else if (count >= nprocs_pof2 && ompi_op_is_commute(op)) { + alg = NBC_ARED_REDSCAT_ALLGATHER; + } else { + alg = NBC_ARED_RING; + } } else { - alg = NBC_ARED_RING; + if (libnbc_iallreduce_algorithm == 1) + alg = NBC_ARED_RING; + else if (libnbc_iallreduce_algorithm == 2) + alg = NBC_ARED_BINOMIAL; + else if (libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op)) + alg = NBC_ARED_REDSCAT_ALLGATHER; + else if (libnbc_iallreduce_algorithm == 4) + alg = NBC_ARED_RDBL; + else + alg = NBC_ARED_RING; } - #ifdef NBC_CACHE_SCHEDULE /* search schedule in communicator specific tree */ search.sendbuf = sendbuf; @@ -135,9 +158,15 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI case NBC_ARED_BINOMIAL: res = allred_sched_diss(rank, p, count, datatype, gap, sendbuf, recvbuf, op, inplace, schedule, tmpbuf); break; + case NBC_ARED_REDSCAT_ALLGATHER: + res = allred_sched_redscat_allgather(rank, p, count, datatype, gap, sendbuf, recvbuf, op, inplace, schedule, tmpbuf, comm); + break; case NBC_ARED_RING: res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, tmpbuf); break; + case NBC_ARED_RDBL: + res = 
allred_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count, datatype, gap, op, inplace, schedule, tmpbuf); + break; } } @@ -449,6 +478,161 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat return OMPI_SUCCESS; } +/* + * allred_sched_recursivedoubling + * + * Function: Recursive doubling algorithm for iallreduce operation + * + * Description: Implements recursive doubling algorithm for iallreduce. + * The algorithm preserves order of operations so it can + * be used both by commutative and non-commutative operations. + * Schedule length: O(\log(p)) + * Memory requirements: + * Each process requires a temporary buffer: count * typesize = O(count) + * + * Example on 7 nodes: + * Initial state + * # 0 1 2 3 4 5 6 + * [0] [1] [2] [3] [4] [5] [6] + * Initial adjustment step for non-power of two nodes. + * old rank 1 3 5 6 + * new rank 0 1 2 3 + * [0+1] [2+3] [4+5] [6] + * Step 1 + * old rank 1 3 5 6 + * new rank 0 1 2 3 + * [0+1+] [0+1+] [4+5+] [4+5+] + * [2+3+] [2+3+] [6 ] [6 ] + * Step 2 + * old rank 1 3 5 6 + * new rank 0 1 2 3 + * [0+1+] [0+1+] [0+1+] [0+1+] + * [2+3+] [2+3+] [2+3+] [2+3+] + * [4+5+] [4+5+] [4+5+] [4+5+] + * [6 ] [6 ] [6 ] [6 ] + * Final adjustment step for non-power of two nodes + * # 0 1 2 3 4 5 6 + * [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] + * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] + * [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] + * [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] + * + */ +static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf, + int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, + char inplace, NBC_Schedule *schedule, void *tmpbuf) +{ + int res, pof2, nprocs_rem, vrank; + char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL; + + tmpsend = (char*) tmpbuf - gap; + tmprecv = (char*) recvbuf; + + if (inplace) { + res = NBC_Sched_copy(recvbuf, false, count, datatype, + tmpsend, false, count, datatype, schedule, true); + } else { + res = NBC_Sched_copy((void *)sendbuf, false, count, datatype, + tmpsend, false, count, datatype, schedule, true); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + + /* Get nearest power of two less than or equal to comm size */ + pof2 = opal_next_poweroftwo(p) >> 1; + + /* Handle non-power-of-two case: + - Even ranks less than 2 * nprocs_rem send their data to (rank + 1), and + sets new rank to -1. + - Odd ranks less than 2 * nprocs_rem receive data from (rank - 1), + apply appropriate operation, and set new rank to rank/2 + - Everyone else sets rank to rank - nprocs_rem + */ + nprocs_rem = p - pof2; + if (rank < 2 * nprocs_rem) { + if (0 == rank % 2) { /* Even */ + res = NBC_Sched_send(tmpsend, false, count, datatype, rank + 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + vrank = -1; + } else { /* Odd */ + res = NBC_Sched_recv(tmprecv, false, count, datatype, rank - 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + + /* tmpsend = tmprecv (op) tmpsend */ + res = NBC_Sched_op(tmprecv, false, tmpsend, false, count, datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + + vrank = rank >> 1; + } + } else { + vrank = rank - nprocs_rem; + } + + /* Communication/Computation loop + - Exchange message with remote node. 
+ - Perform appropriate operation taking in account order of operations: + result = value (op) result + */ + if (0 <= vrank) { + for (int distance = 1; distance < pof2; distance <<= 1) { + int remote = vrank ^ distance; + + /* Find real rank of remote node */ + if (remote < nprocs_rem) { + remote = remote * 2 + 1; + } else { + remote += nprocs_rem; + } + + /* Exchange the data */ + res = NBC_Sched_send(tmpsend, false, count, datatype, remote, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + + res = NBC_Sched_recv(tmprecv, false, count, datatype, remote, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + + /* Apply operation */ + if (rank < remote) { + /* tmprecv = tmpsend (op) tmprecv */ + res = NBC_Sched_op(tmpsend, false, tmprecv, false, + count, datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + + /* Swap tmpsend and tmprecv buffers */ + tmpswap = tmprecv; tmprecv = tmpsend; tmpsend = tmpswap; + } else { + /* tmpsend = tmprecv (op) tmpsend */ + res = NBC_Sched_op(tmprecv, false, tmpsend, false, + count, datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + } + } + } + + /* Handle non-power-of-two case: + - Even ranks less than 2 * nprocs_rem receive result from (rank + 1) + - Odd ranks less than 2 * nprocs_rem send result from tmpsend to (rank - 1) + */ + if (rank < 2 * nprocs_rem) { + if (0 == rank % 2) { /* Even */ + res = NBC_Sched_recv(recvbuf, false, count, datatype, rank + 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + tmpsend = (char *)recvbuf; + } else { /* Odd */ + res = NBC_Sched_send(tmpsend, false, count, datatype, rank - 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + } + } + + /* Copy result back into recvbuf */ + if (tmpsend != recvbuf) { + res = NBC_Sched_copy(tmpsend, false, count, datatype, + recvbuf, false, count, datatype, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + } + + return OMPI_SUCCESS; +} + static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, void *tmpbuf) { int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes */ @@ -735,6 +919,271 @@ static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, return OMPI_SUCCESS; } +/* + * allred_sched_redscat_allgather: + * + * Description: an implementation of Rabenseifner's Allreduce algorithm [1, 2]. + * [1] Rajeev Thakur, Rolf Rabenseifner and William Gropp. + * Optimization of Collective Communication Operations in MPICH // + * The Int. Journal of High Performance Computing Applications. Vol 19, + * Issue 1, pp. 49--66. + * [2] http://www.hlrs.de/mpi/myreduce.html. + * + * This algorithm is a combination of a reduce-scatter implemented with + * recursive vector halving and recursive distance doubling, followed either + * by an allgather implemented with recursive doubling. + * + * Step 1. If the number of processes is not a power of two, reduce it to + * the nearest lower power of two (p' = 2^{\floor{\log_2 p}}) + * by removing r = p - p' extra processes as follows. 
In the first 2r processes + * (ranks 0 to 2r - 1), all the even ranks send the second half of the input + * vector to their right neighbor (rank + 1), and all the odd ranks send + * the first half of the input vector to their left neighbor (rank - 1). + * The even ranks compute the reduction on the first half of the vector and + * the odd ranks compute the reduction on the second half. The odd ranks then + * send the result to their left neighbors (the even ranks). As a result, + * the even ranks among the first 2r processes now contain the reduction with + * the input vector on their right neighbors (the odd ranks). These odd ranks + * do not participate in the rest of the algorithm, which leaves behind + * a power-of-two number of processes. The first r even-ranked processes and + * the last p - 2r processes are now renumbered from 0 to p' - 1. + * + * Step 2. The remaining processes now perform a reduce-scatter by using + * recursive vector halving and recursive distance doubling. The even-ranked + * processes send the second half of their buffer to rank + 1 and the odd-ranked + * processes send the first half of their buffer to rank - 1. All processes + * then compute the reduction between the local buffer and the received buffer. + * In the next log_2(p') - 1 steps, the buffers are recursively halved, and the + * distance is doubled. At the end, each of the p' processes has 1 / p' of the + * total reduction result. + * + * Step 3. An allgather is performed by using recursive vector doubling and + * distance halving. All exchanges are executed in reverse order relative + * to recursive doubling on previous step. If the number of processes is not + * a power of two, the total result vector must be sent to the r processes + * that were removed in the first step. + * + * Limitations: + * count >= 2^{\floor{\log_2 p}} + * commutative operations only + * intra-communicators only + * + * Memory requirements (per process): + * count * typesize + 4 * \log_2(p) * sizeof(int) = O(count) + * + * Schedule length (rounds): O(\log(p)) + */ +static inline int allred_sched_redscat_allgather( + int rank, int comm_size, int count, MPI_Datatype datatype, ptrdiff_t gap, + const void *sbuf, void *rbuf, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmpbuf, struct ompi_communicator_t *comm) +{ + int res = OMPI_SUCCESS; + int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL; + /* Find nearest power-of-two less than or equal to comm_size */ + int nsteps = opal_hibit(comm_size, comm->c_cube_dim + 1); /* ilog2(comm_size) */ + int nprocs_pof2 = 1 << nsteps; /* flp2(comm_size) */ + if (!inplace) { + res = NBC_Sched_copy((char *)sbuf, false, count, datatype, + rbuf, false, count, datatype, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + char *tmp_buf = (char *)tmpbuf - gap; + ptrdiff_t lb, extent; + ompi_datatype_get_extent(datatype, &lb, &extent); + /* + * Step 1. Reduce the number of processes to the nearest lower power of two + * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes. + * 1. In the first 2r processes (ranks 0 to 2r - 1), all the even ranks send + * the second half of the input vector to their right neighbor (rank + 1) + * and all the odd ranks send the first half of the input vector to their + * left neighbor (rank - 1). + * 2. All 2r processes compute the reduction on their half. + * 3. The odd ranks then send the result to their left neighbors + * (the even ranks). 
+ * + * The even ranks (0 to 2r - 1) now contain the reduction with the input + * vector on their right neighbors (the odd ranks). The first r even + * processes and the p - 2r last processes are renumbered from + * 0 to 2^{\floor{\log_2 p}} - 1. + */ + int vrank, step, wsize; + int nprocs_rem = comm_size - nprocs_pof2; + if (rank < 2 * nprocs_rem) { + int count_lhalf = count / 2; + int count_rhalf = count - count_lhalf; + if (rank % 2 != 0) { + /* + * Odd process -- exchange with rank - 1 + * Send the left half of the input vector to the left neighbor, + * Recv the right half of the input vector from the left neighbor + */ + res = NBC_Sched_send(rbuf, false, count_lhalf, datatype, rank - 1, + schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_recv(tmp_buf + (ptrdiff_t)count_lhalf * extent, + false, count_rhalf, datatype, rank - 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_op(tmp_buf + (ptrdiff_t)count_lhalf * extent, + false, (char *)rbuf + (ptrdiff_t)count_lhalf * extent, + false, count_rhalf, datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + /* Send the right half to the left neighbor */ + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + false, count_rhalf, datatype, rank - 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + /* This process does not participate in recursive doubling phase */ + vrank = -1; + } else { + /* + * Even process -- exchange with rank + 1 + * Send the right half of the input vector to the right neighbor, + * Recv the left half of the input vector from the right neighbor + */ + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + false, count_rhalf, datatype, rank + 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_recv(tmp_buf, false, count_lhalf, datatype, rank + 1, + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_op(tmp_buf, false, rbuf, false, count_lhalf, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + /* Recv the right half from the right neighbor */ + res = NBC_Sched_recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + false, count_rhalf, datatype, rank + 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + vrank = rank / 2; + } + } else { /* rank >= 2 * nprocs_rem */ + vrank = rank - nprocs_rem; + } + /* + * Step 2. Reduce-scatter implemented with recursive vector halving and + * recursive distance doubling. We have p' = 2^{\floor{\log_2 p}} + * power-of-two number of processes with new ranks (vrank) and result in rbuf. + * + * The even-ranked processes send the right half of their buffer to rank + 1 + * and the odd-ranked processes send the left half of their buffer to + * rank - 1. All processes then compute the reduction between the local + * buffer and the received buffer. In the next \log_2(p') - 1 steps, the + * buffers are recursively halved, and the distance is doubled. At the end, + * each of the p' processes has 1 / p' of the total reduction result. 
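+ * For example, with p' = 4 and count = 8: the first iteration exchanges + * 4-element halves with the peer at distance 1 and the second exchanges + * 2-element halves with the peer at distance 2, leaving every process + * with a reduced 2-element block (a worked example of the halving schedule).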
+ */ + rindex = malloc(sizeof(*rindex) * nsteps); + sindex = malloc(sizeof(*sindex) * nsteps); + rcount = malloc(sizeof(*rcount) * nsteps); + scount = malloc(sizeof(*scount) * nsteps); + if (NULL == rindex || NULL == sindex || NULL == rcount || NULL == scount) { + res = OMPI_ERR_OUT_OF_RESOURCE; + goto cleanup_and_return; + } + if (vrank != -1) { + step = 0; + wsize = count; + sindex[0] = rindex[0] = 0; + for (int mask = 1; mask < nprocs_pof2; mask <<= 1) { + /* + * On each iteration: rindex[step] = sindex[step] -- beginning of the + * current window. Length of the current window is stored in wsize. + */ + int vdest = vrank ^ mask; + /* Translate vdest virtual rank to real rank */ + int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + if (rank < dest) { + /* + * Recv into the left half of the current window, send the right + * half of the window to the peer (perform reduce on the left + * half of the current window) + */ + rcount[step] = wsize / 2; + scount[step] = wsize - rcount[step]; + sindex[step] = rindex[step] + rcount[step]; + } else { + /* + * Recv into the right half of the current window, send the left + * half of the window to the peer (perform reduce on the right + * half of the current window) + */ + scount[step] = wsize / 2; + rcount[step] = wsize - scount[step]; + rindex[step] = sindex[step] + scount[step]; + } + /* Send part of data from the rbuf, recv into the tmp_buf */ + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)sindex[step] * extent, + false, scount[step], datatype, dest, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_recv((char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, + false, rcount[step], datatype, dest, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + /* Local reduce: rbuf[] = tmp_buf[] (op) rbuf[] */ + res = NBC_Sched_op((char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, + false, (char *)rbuf + (ptrdiff_t)rindex[step] * extent, + false, rcount[step], datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + /* Move the current window to the received message */ + if (step + 1 < nsteps) { + rindex[step + 1] = rindex[step]; + sindex[step + 1] = rindex[step]; + wsize = rcount[step]; + step++; + } + } + /* + * Assertion: each process has 1 / p' of the total reduction result: + * rcount[nsteps - 1] elements in the rbuf[rindex[nsteps - 1], ...]. + */ + /* + * Step 3. Allgather by the recursive doubling algorithm. + * Each process has 1 / p' of the total reduction result: + * rcount[nsteps - 1] elements in the rbuf[rindex[nsteps - 1], ...]. + * All exchanges are executed in reverse order relative + * to recursive doubling (previous step). + */ + step = nsteps - 1; + for (int mask = nprocs_pof2 >> 1; mask > 0; mask >>= 1) { + int vdest = vrank ^ mask; + /* Translate vdest virtual rank to real rank */ + int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + /* + * Send rcount[step] elements from rbuf[rindex[step]...] + * Recv scount[step] elements to rbuf[sindex[step]...]
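+ * Continuing the worked example (p' = 4, count = 8): the allgather below + * first exchanges the owned 2-element block with the peer at distance 2, + * then the accumulated 4 elements with the peer at distance 1, after which + * every process holds all 8 reduced elements.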
+ */ + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)rindex[step] * extent, + false, rcount[step], datatype, dest, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_recv((char *)rbuf + (ptrdiff_t)sindex[step] * extent, + false, scount[step], datatype, dest, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + step--; + } + } + /* + * Step 4. Send total result to excluded odd ranks. + */ + if (rank < 2 * nprocs_rem) { + if (rank % 2 != 0) { + /* Odd process -- recv result from rank - 1 */ + res = NBC_Sched_recv(rbuf, false, count, datatype, rank - 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } else { + /* Even process -- send result to rank + 1 */ + res = NBC_Sched_send(rbuf, false, count, datatype, rank + 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + } + cleanup_and_return: + if (NULL != rindex) + free(rindex); + if (NULL != sindex) + free(sindex); + if (NULL != rcount) + free(rcount); + if (NULL != scount) + free(scount); + return res; +} + int ompi_coll_libnbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module) { @@ -758,4 +1207,3 @@ int ompi_coll_libnbc_allreduce_inter_init(const void* sendbuf, void* recvbuf, in return OMPI_SUCCESS; } - diff --git a/ompi/mca/coll/libnbc/nbc_ibcast.c b/ompi/mca/coll/libnbc/nbc_ibcast.c index 3cd1ca7b26..cbd381328d 100644 --- a/ompi/mca/coll/libnbc/nbc_ibcast.c +++ b/ompi/mca/coll/libnbc/nbc_ibcast.c @@ -26,6 +26,8 @@ static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *sc MPI_Datatype datatype); static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype, int fragsize, size_t size); +static inline int bcast_sched_knomial(int rank, int comm_size, int root, NBC_Schedule *schedule, void *buf, + int count, MPI_Datatype datatype, int knomial_radix); #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ @@ -55,7 +57,7 @@ static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int ro #ifdef NBC_CACHE_SCHEDULE NBC_Bcast_args *args, *found, search; #endif - enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN } alg; + enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN, NBC_BCAST_KNOMIAL } alg; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; rank = ompi_comm_rank (comm); @@ -73,25 +75,40 @@ static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int ro segsize = 16384; /* algorithm selection */ - if( libnbc_ibcast_skip_dt_decision ) { - if (p <= 4) { - alg = NBC_BCAST_LINEAR; + if (libnbc_ibcast_algorithm == 0) { + if( libnbc_ibcast_skip_dt_decision ) { + if (p <= 4) { + alg = NBC_BCAST_LINEAR; + } + else { + alg = NBC_BCAST_BINOMIAL; + } } else { - alg = NBC_BCAST_BINOMIAL; + if (p <= 4) { + alg = NBC_BCAST_LINEAR; + } else if (size * count < 65536) { + alg = NBC_BCAST_BINOMIAL; + } else if (size * count < 524288) { + alg = NBC_BCAST_CHAIN; + segsize = 8192; + } else { + alg = NBC_BCAST_CHAIN; + segsize = 32768; + } } - } - else { - if (p <= 4) { + } else { + /* user forced dynamic decision */ + if (libnbc_ibcast_algorithm == 1) { alg = NBC_BCAST_LINEAR; - } else if (size * count < 65536) { + } else if 
(libnbc_ibcast_algorithm == 2) { alg = NBC_BCAST_BINOMIAL; - } else if (size * count < 524288) { + } else if (libnbc_ibcast_algorithm == 3) { alg = NBC_BCAST_CHAIN; - segsize = 8192; + } else if (libnbc_ibcast_algorithm == 4 && libnbc_ibcast_knomial_radix > 1) { + alg = NBC_BCAST_KNOMIAL; } else { - alg = NBC_BCAST_CHAIN; - segsize = 32768; + alg = NBC_BCAST_LINEAR; } } @@ -119,6 +136,9 @@ static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int ro case NBC_BCAST_CHAIN: res = bcast_sched_chain(rank, p, root, schedule, buffer, count, datatype, segsize, size); break; + case NBC_BCAST_KNOMIAL: + res = bcast_sched_knomial(rank, p, root, schedule, buffer, count, datatype, libnbc_ibcast_knomial_radix); + break; } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -342,6 +362,52 @@ static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *sch return OMPI_SUCCESS; } +/* + * bcast_sched_knomial: + * + * Description: an implementation of Ibcast using k-nomial tree algorithm + * + * Time: (radix - 1)O(log_{radix}(comm_size)) + * Schedule length (rounds): O(log(comm_size)) + */ +static inline int bcast_sched_knomial( + int rank, int comm_size, int root, NBC_Schedule *schedule, void *buf, + int count, MPI_Datatype datatype, int knomial_radix) +{ + int res = OMPI_SUCCESS; + + /* Receive from parent */ + int vrank = (rank - root + comm_size) % comm_size; + int mask = 0x1; + while (mask < comm_size) { + if (vrank % (knomial_radix * mask)) { + int parent = vrank / (knomial_radix * mask) * (knomial_radix * mask); + parent = (parent + root) % comm_size; + res = NBC_Sched_recv(buf, false, count, datatype, parent, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + break; + } + mask *= knomial_radix; + } + mask /= knomial_radix; + + /* Send data to all children */ + while (mask > 0) { + for (int r = 1; r < knomial_radix; r++) { + int child = vrank + mask * r; + if (child < comm_size) { + child = (child + root) % comm_size; + res = NBC_Sched_send(buf, false, count, datatype, child, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + } + mask /= knomial_radix; + } + +cleanup_and_return: + return res; +} + static int nbc_bcast_inter_init(void *buffer, int count, MPI_Datatype datatype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) { diff --git a/ompi/mca/coll/libnbc/nbc_iexscan.c b/ompi/mca/coll/libnbc/nbc_iexscan.c index 90a6b6bf27..547da001dc 100644 --- a/ompi/mca/coll/libnbc/nbc_iexscan.c +++ b/ompi/mca/coll/libnbc/nbc_iexscan.c @@ -18,8 +18,20 @@ * Author(s): Torsten Hoefler * */ +#include "opal/align.h" +#include "ompi/op/op.h" + #include "nbc_internal.h" +static inline int exscan_sched_linear( + int rank, int comm_size, const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, + void *tmpbuf); +static inline int exscan_sched_recursivedoubling( + int rank, int comm_size, const void *sendbuf, void *recvbuf, + int count, MPI_Datatype datatype, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2); + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { @@ -39,32 +51,44 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { } #endif -/* linear iexscan - * working principle: - * 1. 
each node (but node 0) receives from left neigbor - * 2. performs op - * 3. all but rank p-1 do sends to it's right neigbor and exits - * - */ static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) { int rank, p, res; - ptrdiff_t gap, span; NBC_Schedule *schedule; -#ifdef NBC_CACHE_SCHEDULE - NBC_Scan_args *args, *found, search; -#endif char inplace; - void *tmpbuf = NULL; + void *tmpbuf = NULL, *tmpbuf1 = NULL, *tmpbuf2 = NULL; + enum { NBC_EXSCAN_LINEAR, NBC_EXSCAN_RDBL } alg; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; NBC_IN_PLACE(sendbuf, recvbuf, inplace); - rank = ompi_comm_rank (comm); - p = ompi_comm_size (comm); + rank = ompi_comm_rank(comm); + p = ompi_comm_size(comm); + + if (p < 2) { + return nbc_get_noop_request(persistent, request); + } + + span = opal_datatype_span(&datatype->super, count, &gap); + if (libnbc_iexscan_algorithm == 2) { + alg = NBC_EXSCAN_RDBL; + ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t); + tmpbuf = malloc(span_align + span); + if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmpbuf1 = (void *)(-gap); + tmpbuf2 = (char *)(span_align) - gap; + } else { + alg = NBC_EXSCAN_LINEAR; + if (rank > 0) { + tmpbuf = malloc(span); + if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } + } + } #ifdef NBC_CACHE_SCHEDULE + NBC_Scan_args *args, *found, search; /* search schedule in communicator specific tree */ search.sendbuf = sendbuf; search.recvbuf = recvbuf; @@ -74,84 +98,31 @@ static int nbc_exscan_init(const void* sendbuf, void* recvbuf, int count, MPI_Da found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_EXSCAN], &search); if (NULL == found) { #endif - schedule = OBJ_NEW(NBC_Schedule); - if (OPAL_UNLIKELY(NULL == schedule)) { - free(tmpbuf); - return OMPI_ERR_OUT_OF_RESOURCE; - } + schedule = OBJ_NEW(NBC_Schedule); + if (OPAL_UNLIKELY(NULL == schedule)) { + free(tmpbuf); + return OMPI_ERR_OUT_OF_RESOURCE; + } - if (rank != 0) { - span = opal_datatype_span(&datatype->super, count, &gap); - tmpbuf = malloc(span); - if (NULL == tmpbuf) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - if (inplace) { - res = NBC_Sched_copy(recvbuf, false, count, datatype, - (char *)tmpbuf-gap, false, count, datatype, schedule, false); - } else { - res = NBC_Sched_copy((void *)sendbuf, false, count, datatype, - (char *)tmpbuf-gap, false, count, datatype, schedule, false); - } - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } + if (alg == NBC_EXSCAN_LINEAR) { + res = exscan_sched_linear(rank, p, sendbuf, recvbuf, count, datatype, + op, inplace, schedule, tmpbuf); + } else { + res = exscan_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count, + datatype, op, inplace, schedule, tmpbuf1, tmpbuf2); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + OBJ_RELEASE(schedule); + free(tmpbuf); + return res; + } - res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } - - if (rank < p - 1) { - /* we have to wait until we have the data */ - res = NBC_Sched_barrier(schedule); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } - - res = NBC_Sched_op (recvbuf, false, 
(void *)(-gap), true, count, - datatype, op, schedule, true); - - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } - - /* send reduced data onward */ - res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } - } - } else if (p > 1) { - if (inplace) { - res = NBC_Sched_send (recvbuf, false, count, datatype, 1, schedule, false); - } else { - res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule, false); - } - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } - } - - res = NBC_Sched_commit(schedule); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } + res = NBC_Sched_commit(schedule); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + OBJ_RELEASE(schedule); + free(tmpbuf); + return res; + } #ifdef NBC_CACHE_SCHEDULE /* save schedule to tree */ @@ -224,3 +195,168 @@ int ompi_coll_libnbc_exscan_init(const void* sendbuf, void* recvbuf, int count, return OMPI_SUCCESS; } + +/* + * exscan_sched_linear: + * + * Function: Linear algorithm for exclusive scan. + * Accepts: Same as MPI_Iexscan + * Returns: MPI_SUCCESS or error code + * + * Working principle: + * 1. Each process (but process 0) receives from the left neighbor + * 2. Performs op + * 3. All but rank p - 1 send to their right neighbor and exit + * + * Schedule length: O(1) + */ +static inline int exscan_sched_linear( + int rank, int comm_size, const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, + void *tmpbuf) +{ + int res = OMPI_SUCCESS; + ptrdiff_t gap; + opal_datatype_span(&datatype->super, count, &gap); + + if (rank > 0) { + if (inplace) { + res = NBC_Sched_copy(recvbuf, false, count, datatype, + (char *)tmpbuf - gap, false, count, datatype, schedule, false); + } else { + res = NBC_Sched_copy((void *)sendbuf, false, count, datatype, + (char *)tmpbuf - gap, false, count, datatype, schedule, false); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + res = NBC_Sched_recv(recvbuf, false, count, datatype, rank - 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + if (rank < comm_size - 1) { + /* We have to wait until we have the data */ + res = NBC_Sched_barrier(schedule); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + res = NBC_Sched_op(recvbuf, false, (void *)(-gap), true, count, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* Send reduced data onward */ + res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + } else if (comm_size > 1) { + /* Process 0 */ + if (inplace) { + res = NBC_Sched_send(recvbuf, false, count, datatype, 1, schedule, false); + } else { + res = NBC_Sched_send(sendbuf, false, count, datatype, 1, schedule, false); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + +cleanup_and_return: + return res; +} + +/* + * exscan_sched_recursivedoubling: + * + * Function: Recursive doubling algorithm for exclusive scan. + * Accepts: Same as MPI_Iexscan + * Returns: MPI_SUCCESS or error code + * + * Description: Implements the recursive doubling algorithm for MPI_Iexscan.
+ * The algorithm preserves order of operations so it can + * be used both by commutative and non-commutative operations. + * + * Example for 5 processes and commutative operation MPI_SUM: + * Process: 0 1 2 3 4 + * recvbuf: - - - - - + * psend: [0] [1] [2] [3] [4] + * + * Step 1: + * recvbuf: - [0] - [2] - + * psend: [1+0] [0+1] [3+2] [2+3] [4] + * + * Step 2: + * recvbuf: - [0] [1+0] [(0+1)+2] - + * psend: [(3+2)+(1+0)] [(2+3)+(0+1)] [(1+0)+(3+2)] [(1+0)+(2+3)] [4] + * + * Step 3: + * recvbuf: - [0] [1+0] [(0+1)+2] [(3+2)+(1+0)] + * psend: [4+((3+2)+(1+0))] [((3+2)+(1+0))+4] + * + * Time complexity (worst case): \ceil(\log_2(p))(2\alpha + 2m\beta + 2m\gamma) + * Memory requirements (per process): 2 * count * typesize = O(count) + * Limitations: intra-communicators only + * Schedule length: O(log(p)) + */ +static inline int exscan_sched_recursivedoubling( + int rank, int comm_size, const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2) +{ + int res = OMPI_SUCCESS; + char *psend = (char *)tmpbuf1; + char *precv = (char *)tmpbuf2; + + if (!inplace) { + res = NBC_Sched_copy((char *)sendbuf, false, count, datatype, + psend, true, count, datatype, schedule, true); + } else { + res = NBC_Sched_copy((char *)recvbuf, false, count, datatype, + psend, true, count, datatype, schedule, true); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + int is_commute = ompi_op_is_commute(op); + int is_first_block = 1; + + for (int mask = 1; mask < comm_size; mask <<= 1) { + int remote = rank ^ mask; + if (remote < comm_size) { + res = NBC_Sched_send(psend, true, count, datatype, remote, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_recv(precv, true, count, datatype, remote, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + if (rank > remote) { + /* Assertion: rank > 0 and rbuf is valid */ + if (is_first_block) { + res = NBC_Sched_copy(precv, true, count, datatype, + recvbuf, false, count, datatype, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + is_first_block = 0; + } else { + /* Accumulate prefix reduction: recvbuf = precv (op) recvbuf */ + res = NBC_Sched_op(precv, true, recvbuf, false, count, + datatype, op, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + /* Partial result: psend = precv (op) psend */ + res = NBC_Sched_op(precv, true, psend, true, count, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } else { + if (is_commute) { + /* psend = precv (op) psend */ + res = NBC_Sched_op(precv, true, psend, true, count, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } else { + /* precv = psend (op) precv */ + res = NBC_Sched_op(psend, true, precv, true, count, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + char *tmp = psend; + psend = precv; + precv = tmp; + } + } + } + } + +cleanup_and_return: + return res; +} diff --git a/ompi/mca/coll/libnbc/nbc_ineighbor_allgather.c b/ompi/mca/coll/libnbc/nbc_ineighbor_allgather.c index e15ddf3326..ad0f6a128f 100644 --- a/ompi/mca/coll/libnbc/nbc_ineighbor_allgather.c +++ b/ompi/mca/coll/libnbc/nbc_ineighbor_allgather.c @@ -181,157 +181,6 @@ int ompi_coll_libnbc_ineighbor_allgather(const void *sbuf, int
scount, MPI_Datat return OMPI_SUCCESS; } -/* better binomial bcast - * working principle: - * - each node gets a virtual rank vrank - * - the 'root' node get vrank 0 - * - node 0 gets the vrank of the 'root' - * - all other ranks stay identical (they do not matter) - * - * Algorithm: - * - each node with vrank > 2^r and vrank < 2^r+1 receives from node - * vrank - 2^r (vrank=1 receives from 0, vrank 0 receives never) - * - each node sends each round r to node vrank + 2^r - * - a node stops to send if 2^r > commsize - */ -#define RANK2VRANK(rank, vrank, root) \ -{ \ - vrank = rank; \ - if (rank == 0) vrank = root; \ - if (rank == root) vrank = 0; \ -} -#define VRANK2RANK(rank, vrank, root) \ -{ \ - rank = vrank; \ - if (vrank == 0) rank = root; \ - if (vrank == root) rank = 0; \ -} -static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) { - int maxr, vrank, peer, res; - - maxr = (int)ceil((log((double)p)/LOG2)); - - RANK2VRANK(rank, vrank, root); - - /* receive from the right hosts */ - if (vrank != 0) { - for (int r = 0 ; r < maxr ; ++r) { - if ((vrank >= (1 << r)) && (vrank < (1 << (r + 1)))) { - VRANK2RANK(peer, vrank - (1 << r), root); - res = NBC_Sched_recv (buffer, false, count, datatype, peer, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } - } - - res = NBC_Sched_barrier (schedule); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } - - /* now send to the right hosts */ - for (int r = 0 ; r < maxr ; ++r) { - if (((vrank + (1 << r) < p) && (vrank < (1 << r))) || (vrank == 0)) { - VRANK2RANK(peer, vrank + (1 << r), root); - res = NBC_Sched_send (buffer, false, count, datatype, peer, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } - } - - return OMPI_SUCCESS; -} - -/* simple linear MPI_Ibcast */ -static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) { - int res; - - /* send to all others */ - if(rank == root) { - for (int peer = 0 ; peer < p ; ++peer) { - if (peer != root) { - /* send msg to peer */ - res = NBC_Sched_send (buffer, false, count, datatype, peer, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } - } - } else { - /* recv msg from root */ - res = NBC_Sched_recv (buffer, false, count, datatype, root, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } - - return OMPI_SUCCESS; -} - -/* simple chained MPI_Ibcast */ -static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype, int fragsize, size_t size) { - int res, vrank, rpeer, speer, numfrag, fragcount, thiscount; - MPI_Aint ext; - char *buf; - - RANK2VRANK(rank, vrank, root); - VRANK2RANK(rpeer, vrank-1, root); - VRANK2RANK(speer, vrank+1, root); - res = ompi_datatype_type_extent(datatype, &ext); - if (MPI_SUCCESS != res) { - NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); - return res; - } - - if (count == 0) { - return OMPI_SUCCESS; - } - - numfrag = count * size/fragsize; - if ((count * size) % fragsize != 0) { - numfrag++; - } - - fragcount = count/numfrag; - - for (int fragnum = 0 ; fragnum < numfrag ; ++fragnum) { - buf = (char *) buffer + fragnum * fragcount * ext; - thiscount = fragcount; - if (fragnum == numfrag-1) { - /* last fragment may not be full */ - thiscount = count - fragcount * fragnum; - } - - /* root 
does not receive */ - if (vrank != 0) { - res = NBC_Sched_recv (buf, false, thiscount, datatype, rpeer, schedule, true); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } - - /* last rank does not send */ - if (vrank != p-1) { - res = NBC_Sched_send (buf, false, thiscount, datatype, speer, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - - /* this barrier here seems awaward but isn't!!!! */ - if (vrank == 0) { - res = NBC_Sched_barrier (schedule); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - return res; - } - } - } - } - - return OMPI_SUCCESS; -} int ompi_coll_libnbc_neighbor_allgather_init(const void *sbuf, int scount, MPI_Datatype stype, void *rbuf, int rcount, MPI_Datatype rtype, struct ompi_communicator_t *comm, diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h index da9786dbb6..735beaa06e 100644 --- a/ompi/mca/coll/libnbc/nbc_internal.h +++ b/ompi/mca/coll/libnbc/nbc_internal.h @@ -516,6 +516,11 @@ static inline int NBC_Unpack(void *src, int srccount, MPI_Datatype srctype, void int res; ptrdiff_t ext, lb; + res = ompi_datatype_pack_external_size("external32", srccount, srctype, &size); + if (OMPI_SUCCESS != res) { + NBC_Error ("MPI Error in ompi_datatype_pack_external_size() (%i)", res); + return res; + } #if OPAL_CUDA_SUPPORT if(NBC_Type_intrinsic(srctype) && !(opal_cuda_check_bufs((char *)tgt, (char *)src))) { #else @@ -523,7 +528,6 @@ static inline int NBC_Unpack(void *src, int srccount, MPI_Datatype srctype, void #endif /* OPAL_CUDA_SUPPORT */ /* if we have the same types and they are contiguous (intrinsic * types are contiguous), we can just use a single memcpy */ - res = ompi_datatype_pack_external_size("external32", srccount, srctype, &size); res = ompi_datatype_get_extent (srctype, &lb, &ext); if (OMPI_SUCCESS != res) { NBC_Error ("MPI Error in MPI_Type_extent() (%i)", res); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index c222fa3a7f..da50f1eb27 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -19,7 +19,9 @@ * */ -#include "opal/include/opal/align.h" +#include "ompi_config.h" +#include "opal/align.h" +#include "opal/util/bit_ops.h" #include "ompi/op/op.h" #include "nbc_internal.h" @@ -31,6 +33,10 @@ static inline int red_sched_chain (int rank, int p, int root, const void *sendbu static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule); +static inline int red_sched_redscat_gather( + int rank, int comm_size, int root, const void *sbuf, void *rbuf, + char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmp_buf, struct ompi_communicator_t *comm); #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ @@ -63,7 +69,7 @@ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Da char *redbuf=NULL, inplace; void *tmpbuf; char tmpredbuf = 0; - enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN } alg; + enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN, NBC_RED_REDSCAT_GATHER} alg; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ptrdiff_t span, gap; @@ -98,22 +104,42 @@ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Da span = opal_datatype_span(&datatype->super, count, &gap); /* algorithm selection */ - if (p > 4 || size * count < 65536 || 
!ompi_op_is_commute(op)) { - alg = NBC_RED_BINOMIAL; - if(rank == root) { - /* root reduces in receivebuffer */ - tmpbuf = malloc (span); + int nprocs_pof2 = opal_next_poweroftwo(p) >> 1; + if (libnbc_ireduce_algorithm == 0) { + if (ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { + alg = NBC_RED_REDSCAT_GATHER; + } else if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { + alg = NBC_RED_BINOMIAL; + } else { + alg = NBC_RED_CHAIN; + } + } else { + if (libnbc_ireduce_algorithm == 1) { + alg = NBC_RED_CHAIN; + } else if (libnbc_ireduce_algorithm == 2) { + alg = NBC_RED_BINOMIAL; + } else if (libnbc_ireduce_algorithm == 3 && ompi_op_is_commute(op) && p > 2 && count >= nprocs_pof2) { + alg = NBC_RED_REDSCAT_GATHER; + } else { + alg = NBC_RED_CHAIN; + } + } + + /* allocate temporary buffers */ + if (alg == NBC_RED_REDSCAT_GATHER || alg == NBC_RED_BINOMIAL) { + if (rank == root) { + /* root reduces in receive buffer */ + tmpbuf = malloc(span); redbuf = recvbuf; } else { /* recvbuf may not be valid on non-root nodes */ ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t); - tmpbuf = malloc (span_align + span); - redbuf = (char*)span_align - gap; + tmpbuf = malloc(span_align + span); + redbuf = (char *)span_align - gap; tmpredbuf = 1; } } else { tmpbuf = malloc (span); - alg = NBC_RED_CHAIN; segsize = 16384/2; } @@ -151,6 +177,9 @@ static int nbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Da case NBC_RED_CHAIN: res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, tmpbuf, segsize); break; + case NBC_RED_REDSCAT_GATHER: + res = red_sched_redscat_gather(rank, p, root, sendbuf, redbuf, tmpredbuf, count, datatype, op, inplace, schedule, tmpbuf, comm); + break; } } @@ -560,6 +589,354 @@ static inline int red_sched_linear (int rank, int rsize, int root, const void *s return OMPI_SUCCESS; } +/* + * red_sched_redscat_gather: + * + * Description: an implementation of Rabenseifner's Reduce algorithm [1, 2]. + * [1] Rajeev Thakur, Rolf Rabenseifner and William Gropp. + * Optimization of Collective Communication Operations in MPICH // + * The Int. Journal of High Performance Computing Applications. Vol 19, + * Issue 1, pp. 49--66. + * [2] http://www.hlrs.de/mpi/myreduce.html. + * + * This algorithm is a combination of a reduce-scatter implemented with + * recursive vector halving and recursive distance doubling, followed + * by a binomial tree gather. + * + * Step 1. If the number of processes is not a power of two, reduce it to + * the nearest lower power of two (p' = 2^{\floor{\log_2 p}}) + * by removing r = p - p' extra processes as follows. In the first 2r processes + * (ranks 0 to 2r - 1), all the even ranks send the second half of the input + * vector to their right neighbor (rank + 1), and all the odd ranks send + * the first half of the input vector to their left neighbor (rank - 1). + * The even ranks compute the reduction on the first half of the vector and + * the odd ranks compute the reduction on the second half. The odd ranks then + * send the result to their left neighbors (the even ranks). As a result, + * the even ranks among the first 2r processes now contain the reduction with + * the input vector on their right neighbors (the odd ranks). These odd ranks + * do not participate in the rest of the algorithm, which leaves behind + * a power-of-two number of processes. The first r even-ranked processes and + * the last p - 2r processes are now renumbered from 0 to p' - 1.
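+ * For example, with p = 10: p' = 8 and r = 2, so ranks 0 and 2 absorb the + * vectors of ranks 1 and 3, ranks 1 and 3 drop out, and processes + * 0, 2, 4, 5, ..., 9 continue with new ranks 0 through 7.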
+ * + * Step 2. The remaining processes now perform a reduce-scatter by using + * recursive vector halving and recursive distance doubling. The even-ranked + * processes send the second half of their buffer to rank + 1 and the odd-ranked + * processes send the first half of their buffer to rank - 1. All processes + * then compute the reduction between the local buffer and the received buffer. + * In the next log_2(p') - 1 steps, the buffers are recursively halved, and the + * distance is doubled. At the end, each of the p' processes has 1 / p' of the + * total reduction result. + * + * Step 3. A binomial tree gather is performed by using recursive vector + * doubling and distance halving. In the non-power-of-two case, if the root + * happens to be one of those odd-ranked processes that would normally + * be removed in the first step, then the role of this process and process 0 + * are interchanged. + * + * Limitations: + * count >= 2^{\floor{\log_2 p}} + * commutative operations only + * intra-communicators only + * + * Memory requirements (per process): + * rank != root: 2 * count * typesize + 4 * \log_2(p) * sizeof(int) = O(count) + * rank == root: count * typesize + 4 * \log_2(p) * sizeof(int) = O(count) + * + * Schedule length (rounds): O(\log(p)) + * Recommendation: root = 0; otherwise additional steps are required + * in the root process. + */ +static inline int red_sched_redscat_gather( + int rank, int comm_size, int root, const void *sbuf, void *rbuf, + char tmpredbuf, int count, MPI_Datatype datatype, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmp_buf, struct ompi_communicator_t *comm) +{ + int res = OMPI_SUCCESS; + int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL; + + /* Find nearest power-of-two less than or equal to comm_size */ + int nsteps = opal_hibit(comm_size, comm->c_cube_dim + 1); /* ilog2(comm_size) */ + if (nsteps < 1) { + /* This case never happens (for comm_size < 2 other algorithms are used) */ + return OMPI_ERR_NOT_SUPPORTED; + } + int nprocs_pof2 = 1 << nsteps; /* flp2(comm_size) */ + + ptrdiff_t lb, extent; + ompi_datatype_get_extent(datatype, &lb, &extent); + + if ((rank != root) || !inplace) { + res = NBC_Sched_copy((char *)sbuf, false, count, datatype, + rbuf, tmpredbuf, count, datatype, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + + /* + * Step 1. Reduce the number of processes to the nearest lower power of two + * p' = 2^{\floor{\log_2 p}} by removing r = p - p' processes. + * 1. In the first 2r processes (ranks 0 to 2r - 1), all the even ranks send + * the second half of the input vector to their right neighbor (rank + 1) + * and all the odd ranks send the first half of the input vector to their + * left neighbor (rank - 1). + * 2. All 2r processes compute the reduction on their half. + * 3. The odd ranks then send the result to their left neighbors + * (the even ranks). + * + * The even ranks (0 to 2r - 1) now contain the reduction with the input + * vector on their right neighbors (the odd ranks). The first r even + * processes and the p - 2r last processes are renumbered from + * 0 to 2^{\floor{\log_2 p}} - 1. These odd ranks do not participate in the + * rest of the algorithm.
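+ * The renumbering implemented below is equivalent to (shown only as an + * illustration): + * vrank = (rank < 2 * nprocs_rem) ? ((rank % 2 == 0) ? rank / 2 : -1) + * : rank - nprocs_rem; + * where vrank == -1 marks the odd ranks that sit out steps 2 and 3.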
+ */ + + int vrank, step, wsize; + int nprocs_rem = comm_size - nprocs_pof2; + + if (rank < 2 * nprocs_rem) { + int count_lhalf = count / 2; + int count_rhalf = count - count_lhalf; + + if (rank % 2 != 0) { + /* + * Odd process -- exchange with rank - 1 + * Send the left half of the input vector to the left neighbor, + * Recv the right half of the input vector from the left neighbor + */ + res = NBC_Sched_send(rbuf, tmpredbuf, count_lhalf, datatype, rank - 1, + schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + res = NBC_Sched_recv((char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + false, count_rhalf, datatype, rank - 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + res = NBC_Sched_op((char *)tmp_buf + (ptrdiff_t)count_lhalf * extent, + false, (char *)rbuf + (ptrdiff_t)count_lhalf * extent, + tmpredbuf, count_rhalf, datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* Send the right half to the left neighbor */ + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + tmpredbuf, count_rhalf, datatype, rank - 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* This process does not participate in recursive doubling phase */ + vrank = -1; + + } else { + /* + * Even process -- exchange with rank + 1 + * Send the right half of the input vector to the right neighbor, + * Recv the left half of the input vector from the right neighbor + */ + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + tmpredbuf, count_rhalf, datatype, rank + 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + res = NBC_Sched_recv((char *)tmp_buf, false, count_lhalf, datatype, rank + 1, + schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + res = NBC_Sched_op(tmp_buf, false, rbuf, tmpredbuf, count_lhalf, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* Recv the right half from the right neighbor */ + res = NBC_Sched_recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent, + tmpredbuf, count_rhalf, datatype, rank + 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + vrank = rank / 2; + } + } else { /* rank >= 2 * nprocs_rem */ + vrank = rank - nprocs_rem; + } + + /* + * Step 2. Reduce-scatter implemented with recursive vector halving and + * recursive distance doubling. We have p' = 2^{\floor{\log_2 p}} + * power-of-two number of processes with new ranks (vrank) and result in rbuf. + * + * The even-ranked processes send the right half of their buffer to rank + 1 + * and the odd-ranked processes send the left half of their buffer to + * rank - 1. All processes then compute the reduction between the local + * buffer and the received buffer. In the next \log_2(p') - 1 steps, the + * buffers are recursively halved, and the distance is doubled. At the end, + * each of the p' processes has 1 / p' of the total reduction result. 
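+ * For example, with p' = 8 the loop below makes nsteps = 3 passes with + * window sizes wsize = count, count/2, count/4, so each surviving process + * finishes the reduce-scatter owning roughly count / 8 elements.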
+ */ + + rindex = malloc(sizeof(*rindex) * nsteps); /* O(\log_2(p)) */ + sindex = malloc(sizeof(*sindex) * nsteps); + rcount = malloc(sizeof(*rcount) * nsteps); + scount = malloc(sizeof(*scount) * nsteps); + if (NULL == rindex || NULL == sindex || NULL == rcount || NULL == scount) { + res = OMPI_ERR_OUT_OF_RESOURCE; + goto cleanup_and_return; + } + + if (vrank != -1) { + step = 0; + wsize = count; + sindex[0] = rindex[0] = 0; + + for (int mask = 1; mask < nprocs_pof2; mask <<= 1) { + /* + * On each iteration: rindex[step] = sindex[step] -- beginning of the + * current window. Length of the current window is stored in wsize. + */ + int vdest = vrank ^ mask; + /* Translate vdest virtual rank to real rank */ + int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + + if (rank < dest) { + /* + * Recv into the left half of the current window, send the right + * half of the window to the peer (perform reduce on the left + * half of the current window) + */ + rcount[step] = wsize / 2; + scount[step] = wsize - rcount[step]; + sindex[step] = rindex[step] + rcount[step]; + } else { + /* + * Recv into the right half of the current window, send the left + * half of the window to the peer (perform reduce on the right + * half of the current window) + */ + scount[step] = wsize / 2; + rcount[step] = wsize - scount[step]; + rindex[step] = sindex[step] + scount[step]; + } + + /* Send part of data from the rbuf, recv into the tmp_buf */ + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)sindex[step] * extent, + tmpredbuf, scount[step], datatype, dest, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_recv((char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, + false, rcount[step], datatype, dest, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* Local reduce: rbuf[] = tmp_buf[] (op) rbuf[] */ + res = NBC_Sched_op((char *)tmp_buf + (ptrdiff_t)rindex[step] * extent, + false, (char *)rbuf + (ptrdiff_t)rindex[step] * extent, + tmpredbuf, rcount[step], datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* Move the current window to the received message */ + if (step + 1 < nsteps) { + rindex[step + 1] = rindex[step]; + sindex[step + 1] = rindex[step]; + wsize = rcount[step]; + step++; + } + } + } + /* + * Assertion: each process has 1 / p' of the total reduction result: + * rcount[nsteps - 1] elements in the rbuf[rindex[nsteps - 1], ...]. + */ + + /* + * Set up the root process for the gather operation. + * Case 1: root < 2r and root is odd -- root process was excluded on step 1 + * Recv data from process 0, vroot = 0, vrank = 0 + * Case 2: root < 2r and root is even: vroot = root / 2 + * Case 3: root >= 2r: vroot = root - r + */ + int vroot = 0; + if (root < 2 * nprocs_rem) { + if (root % 2 != 0) { + vroot = 0; + if (rank == root) { + /* + * Case 1: root < 2r and root is odd -- root process was + * excluded on step 1 (vrank == -1). + * Recv data from process 0.
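+ * For example, with p = 10 (p' = 8, r = 2) and root = 1: the root is an + * excluded odd rank, so it rebuilds vrank 0's window bookkeeping below, + * receives vrank 0's final block, and then stands in for vrank 0 during + * the gather.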
+ */ + rindex[0] = 0; + step = 0, wsize = count; + for (int mask = 1; mask < nprocs_pof2; mask *= 2) { + rcount[step] = wsize / 2; + scount[step] = wsize - rcount[step]; + rindex[step] = 0; + sindex[step] = rcount[step]; + step++; + wsize /= 2; + } + + res = NBC_Sched_recv(rbuf, tmpredbuf, rcount[nsteps - 1], datatype, + 0, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + vrank = 0; + + } else if (vrank == 0) { + /* Send data to the root */ + res = NBC_Sched_send(rbuf, tmpredbuf, rcount[nsteps - 1], datatype, + root, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + vrank = -1; + } + } else { + /* Case 2: root < 2r and root is even: vroot = root / 2 */ + vroot = root / 2; + } + } else { + /* Case 3: root >= 2r: vroot = root - r */ + vroot = root - nprocs_rem; + } + + /* + * Step 3. Gather result at the vroot by the binomial tree algorithm. + * Each process has 1 / p' of the total reduction result: + * rcount[nsteps - 1] elements in the rbuf[rindex[nsteps - 1], ...]. + * All exchanges are executed in reverse order relative + * to recursive doubling (previous step). + */ + + if (vrank != -1) { + int vdest_tree, vroot_tree; + step = nsteps - 1; /* step = ilog2(p') - 1 */ + + for (int mask = nprocs_pof2 >> 1; mask > 0; mask >>= 1) { + int vdest = vrank ^ mask; + /* Translate vdest virtual rank to real rank */ + int dest = (vdest < nprocs_rem) ? vdest * 2 : vdest + nprocs_rem; + if ((vdest == 0) && (root < 2 * nprocs_rem) && (root % 2 != 0)) + dest = root; + + vdest_tree = vdest >> step; + vdest_tree <<= step; + vroot_tree = vroot >> step; + vroot_tree <<= step; + if (vdest_tree == vroot_tree) { + /* Send data from rbuf and exit */ + + res = NBC_Sched_send((char *)rbuf + (ptrdiff_t)rindex[step] * extent, + tmpredbuf, rcount[step], datatype, dest, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + break; + } else { + /* Recv and continue */ + res = NBC_Sched_recv((char *)rbuf + (ptrdiff_t)sindex[step] * extent, + tmpredbuf, scount[step], datatype, dest, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + step--; + } + } + + cleanup_and_return: + if (NULL != rindex) + free(rindex); + if (NULL != sindex) + free(sindex); + if (NULL != rcount) + free(rcount); + if (NULL != scount) + free(scount); + return res; +} + int ompi_coll_libnbc_reduce_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, struct ompi_communicator_t *comm, MPI_Info info, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module) { diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c index 230bcaa010..9e0ebb39b3 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -21,7 +21,7 @@ * Author(s): Torsten Hoefler * */ -#include "opal/include/opal/align.h" +#include "opal/align.h" #include "nbc_internal.h" diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c index 6dadd1eafa..54bee4f13d 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c @@ -19,7 +19,7 @@ * Author(s): Torsten Hoefler * */ -#include "opal/include/opal/align.h" +#include "opal/align.h" #include "nbc_internal.h" diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c index 33374ede7a..ccc531d669 100644
--- a/ompi/mca/coll/libnbc/nbc_iscan.c +++ b/ompi/mca/coll/libnbc/nbc_iscan.c @@ -18,8 +18,20 @@ * Author(s): Torsten Hoefler * */ +#include "opal/align.h" +#include "ompi/op/op.h" + #include "nbc_internal.h" +static inline int scan_sched_linear( + int rank, int comm_size, const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, + void *tmpbuf); +static inline int scan_sched_recursivedoubling( + int rank, int comm_size, const void *sendbuf, void *recvbuf, + int count, MPI_Datatype datatype, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2); + #ifdef NBC_CACHE_SCHEDULE /* tree comparison function for schedule cache */ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { @@ -39,27 +51,41 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { } #endif -/* linear iscan - * working principle: - * 1. each node (but node 0) receives from left neighbor - * 2. performs op - * 3. all but rank p-1 do sends to it's right neighbor and exits - * - */ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module, bool persistent) { - int rank, p, res; - ptrdiff_t gap, span; - NBC_Schedule *schedule; - void *tmpbuf = NULL; - char inplace; - ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + int rank, p, res; + ptrdiff_t gap, span; + NBC_Schedule *schedule; + void *tmpbuf = NULL, *tmpbuf1 = NULL, *tmpbuf2 = NULL; + enum { NBC_SCAN_LINEAR, NBC_SCAN_RDBL } alg; + char inplace; + ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; - NBC_IN_PLACE(sendbuf, recvbuf, inplace); + NBC_IN_PLACE(sendbuf, recvbuf, inplace); - rank = ompi_comm_rank (comm); - p = ompi_comm_size (comm); + rank = ompi_comm_rank (comm); + p = ompi_comm_size (comm); + + if (count == 0) { + return nbc_get_noop_request(persistent, request); + } + + span = opal_datatype_span(&datatype->super, count, &gap); + if (libnbc_iscan_algorithm == 2) { + alg = NBC_SCAN_RDBL; + ptrdiff_t span_align = OPAL_ALIGN(span, datatype->super.align, ptrdiff_t); + tmpbuf = malloc(span_align + span); + if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmpbuf1 = (void *)(-gap); + tmpbuf2 = (char *)(span_align) - gap; + } else { + alg = NBC_SCAN_LINEAR; + if (rank > 0) { + tmpbuf = malloc(span); + if (NULL == tmpbuf) { return OMPI_ERR_OUT_OF_RESOURCE; } + } + } #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; @@ -75,60 +101,28 @@ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Data #endif schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - if (!inplace) { - /* copy data to receivebuf */ - res = NBC_Sched_copy ((void *)sendbuf, false, count, datatype, - recvbuf, false, count, datatype, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - return res; - } - } - - if(rank != 0) { - span = opal_datatype_span(&datatype->super, count, &gap); - tmpbuf = malloc (span); - if (NULL == tmpbuf) { - OBJ_RELEASE(schedule); + free(tmpbuf); return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* we have to wait until we have the data */ - res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - 
free(tmpbuf); - return res; - } - - /* perform the reduce in my local buffer */ - /* this cannot be done until tmpbuf is unused :-( so barrier after the op */ - res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule, - true); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } } - if (rank != p-1) { - res = NBC_Sched_send (recvbuf, false, count, datatype, rank+1, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } + if (alg == NBC_SCAN_LINEAR) { + res = scan_sched_linear(rank, p, sendbuf, recvbuf, count, datatype, + op, inplace, schedule, tmpbuf); + } else { + res = scan_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count, + datatype, op, inplace, schedule, tmpbuf1, tmpbuf2); } - - res = NBC_Sched_commit (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; + OBJ_RELEASE(schedule); + free(tmpbuf); + return res; + } + + res = NBC_Sched_commit(schedule); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + OBJ_RELEASE(schedule); + free(tmpbuf); + return res; } #ifdef NBC_CACHE_SCHEDULE @@ -162,14 +156,160 @@ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Data } #endif - res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - OBJ_RELEASE(schedule); - free(tmpbuf); - return res; - } + res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + OBJ_RELEASE(schedule); + free(tmpbuf); + return res; + } - return OMPI_SUCCESS; + return OMPI_SUCCESS; +} + +/* + * scan_sched_linear: + * + * Function: Linear algorithm for inclusive scan. + * Accepts: Same as MPI_Iscan + * Returns: MPI_SUCCESS or error code + * + * Working principle: + * 1. Each process (but process 0) receives from the left neighbor + * 2. Performs op + * 3. All but rank p-1 send to their right neighbor and exit + * + * Schedule length: O(1) + */ +static inline int scan_sched_linear( + int rank, int comm_size, const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, char inplace, NBC_Schedule *schedule, + void *tmpbuf) +{ + int res = OMPI_SUCCESS; + + if (!inplace) { + /* Copy data to recvbuf */ + res = NBC_Sched_copy((void *)sendbuf, false, count, datatype, + recvbuf, false, count, datatype, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + + if (rank > 0) { + ptrdiff_t gap; + opal_datatype_span(&datatype->super, count, &gap); + /* We have to wait until we have the data */ + res = NBC_Sched_recv((void *)(-gap), true, count, datatype, rank - 1, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + /* Perform the reduce in my local buffer */ + /* this cannot be done until tmpbuf is unused :-( so barrier after the op */ + res = NBC_Sched_op((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule, + true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + + if (rank != comm_size - 1) { + res = NBC_Sched_send(recvbuf, false, count, datatype, rank + 1, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + +cleanup_and_return: + return res; +} + +/* + * scan_sched_recursivedoubling: + * + * Function: Recursive doubling algorithm for inclusive scan.
+ * Accepts: Same as MPI_Iscan + * Returns: MPI_SUCCESS or error code + * + * Description: Implements recursive doubling algorithm for MPI_Iscan. + * The algorithm preserves the order of operations, so it can + * be used with both commutative and non-commutative operations. + * + * Example for 5 processes and commutative operation MPI_SUM: + * Process: 0 1 2 3 4 + * recvbuf: [0] [1] [2] [3] [4] + * psend: [0] [1] [2] [3] [4] + * + * Step 1: + * recvbuf: [0] [0+1] [2] [2+3] [4] + * psend: [1+0] [0+1] [3+2] [2+3] [4] + * + * Step 2: + * recvbuf: [0] [0+1] [(1+0)+2] [(1+0)+(2+3)] [4] + * psend: [(3+2)+(1+0)] [(2+3)+(0+1)] [(1+0)+(3+2)] [(1+0)+(2+3)] [4] + * + * Step 3 (only ranks 0 and 4 exchange; ranks 1-3 keep their Step 2 psend): + * recvbuf: [0] [0+1] [(1+0)+2] [(1+0)+(2+3)] [((3+2)+(1+0))+4] + * psend: [4+((3+2)+(1+0))] [(2+3)+(0+1)] [(1+0)+(3+2)] [(1+0)+(2+3)] [((3+2)+(1+0))+4] + * + * Time complexity (worst case): \ceil(\log_2(p))(2\alpha + 2m\beta + 2m\gamma) + * Memory requirements (per process): 2 * count * typesize = O(count) + * Limitations: intra-communicators only + * Schedule length: O(log(p)) + */ +static inline int scan_sched_recursivedoubling( + int rank, int comm_size, const void *sendbuf, void *recvbuf, int count, + MPI_Datatype datatype, MPI_Op op, char inplace, + NBC_Schedule *schedule, void *tmpbuf1, void *tmpbuf2) +{ + int res = OMPI_SUCCESS; + + if (!inplace) { + res = NBC_Sched_copy((void *)sendbuf, false, count, datatype, + recvbuf, false, count, datatype, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } + if (comm_size < 2) + goto cleanup_and_return; + + char *psend = (char *)tmpbuf1; + char *precv = (char *)tmpbuf2; + res = NBC_Sched_copy(recvbuf, false, count, datatype, + psend, true, count, datatype, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + int is_commute = ompi_op_is_commute(op); + for (int mask = 1; mask < comm_size; mask <<= 1) { + int remote = rank ^ mask; + if (remote < comm_size) { + res = NBC_Sched_send(psend, true, count, datatype, remote, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + res = NBC_Sched_recv(precv, true, count, datatype, remote, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + + if (rank > remote) { + /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */ + res = NBC_Sched_op(precv, true, recvbuf, false, count, + datatype, op, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + /* Partial result: psend = precv <op> psend */ + res = NBC_Sched_op(precv, true, psend, true, count, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } else { + if (is_commute) { + /* psend = precv <op> psend */ + res = NBC_Sched_op(precv, true, psend, true, count, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + } else { + /* precv = psend <op> precv */ + res = NBC_Sched_op(psend, true, precv, true, count, + datatype, op, schedule, true); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { goto cleanup_and_return; } + char *tmp = psend; + psend = precv; + precv = tmp; + } + } + } + } + + cleanup_and_return: + return res; } int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index d6fc4b89bd..e4d66cc600 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -5,6 +5,7 @@ * reserved.
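Before the coll_tuned hunks that follow, the exchange pattern of the recursive-doubling schedule above is easier to see in plain blocking form. The standalone program below mirrors the commutative fast path (recvbuf accumulates the inclusive prefix, psend carries the partial exchanged with rank ^ mask); it is an illustrative sketch for MPI_SUM on one int per rank, not the libnbc code, and it omits the non-commutative swap branch.

    #include <mpi.h>
    #include <stdio.h>

    /* Standalone sketch: recursive-doubling inclusive scan, commutative case,
     * written with blocking calls for clarity. */
    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        int recvbuf = rank;  /* accumulates the inclusive prefix */
        int psend = rank;    /* partial result exchanged with peers */

        for (int mask = 1; mask < size; mask <<= 1) {
            int remote = rank ^ mask;
            if (remote < size) {
                int precv;
                MPI_Sendrecv(&psend, 1, MPI_INT, remote, 0,
                             &precv, 1, MPI_INT, remote, 0,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                if (rank > remote) {
                    recvbuf += precv;  /* recvbuf = precv <op> recvbuf */
                }
                psend += precv;        /* commutative: psend = precv <op> psend */
            }
        }

        printf("rank %d: inclusive prefix = %d\n", rank, recvbuf);
        MPI_Finalize();
        return 0;
    }

On p ranks it prints rank i's inclusive prefix 0+1+...+i, matching the recvbuf column of the comment above; a non-commutative op would instead update psend/precv through the rank-ordered else branch shown in the schedule.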
* Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,6 +42,10 @@ extern int ompi_coll_tuned_alltoall_intermediate_msg; extern int ompi_coll_tuned_alltoall_large_msg; extern int ompi_coll_tuned_alltoall_min_procs; extern int ompi_coll_tuned_alltoall_max_requests; +extern int ompi_coll_tuned_scatter_intermediate_msg; +extern int ompi_coll_tuned_scatter_large_msg; +extern int ompi_coll_tuned_scatter_min_procs; +extern int ompi_coll_tuned_scatter_blocking_send_ratio; /* forced algorithm choices */ /* this structure is for storing the indexes to the forced algorithm mca params... */ diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index 25e9bc77a0..a17cfacb12 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -16,6 +16,7 @@ * reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -64,6 +65,12 @@ int ompi_coll_tuned_alltoall_large_msg = 3000; int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */ int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */ +/* Disable by default */ +int ompi_coll_tuned_scatter_intermediate_msg = 0; +int ompi_coll_tuned_scatter_large_msg = 0; +int ompi_coll_tuned_scatter_min_procs = 0; +int ompi_coll_tuned_scatter_blocking_send_ratio = 0; + /* forced algorithm variables */ /* indices for the MCA parameters */ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}}; diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 97560c5c08..b3699ed273 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -15,6 +15,7 @@ * reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -780,6 +781,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount, { const size_t small_block_size = 300; const int small_comm_size = 10; + const int intermediate_comm_size = 64; int communicator_size, rank; size_t dsize, block_size; @@ -802,7 +804,16 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount, return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module); + } else if ((communicator_size < ompi_coll_tuned_scatter_min_procs) && + (communicator_size > intermediate_comm_size) && + (block_size >= ompi_coll_tuned_scatter_intermediate_msg) && + (block_size < ompi_coll_tuned_scatter_large_msg)) { + return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + ompi_coll_tuned_scatter_blocking_send_ratio); } + return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module); diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index eab5009183..7ba85078fd 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -90,14 +90,14 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** fptr = fopen (fname, "r"); if (!fptr) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"cannot read rules file [%s]\n", fname)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot read rules file [%s]\n", fname)); goto on_file_error; } /* make space and init the algorithm rules for each of the n_collectives MPI collectives */ alg_rules = ompi_coll_tuned_mk_alg_rules (n_collectives); if (NULL == alg_rules) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"cannot cannot allocate rules for file [%s]\n", fname)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate rules for file [%s]\n", fname)); goto on_file_error; } @@ -142,6 +142,10 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %d for dynamic rule for collective ID %d\n", NCS, CI)); alg_p->n_com_sizes = NCS; alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); + if (NULL == alg_p->com_rules) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate com rules for file [%s]\n", fname)); + goto on_file_error; + } for (ncs=0;ncs<NCS;ncs++) { ... com_p->n_msg_sizes = NMS; com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); + if (NULL == com_p->msg_rules) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate msg rules for file [%s]\n", fname)); + goto on_file_error; + } msg_p = com_p->msg_rules; diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index ff409b5eba..bf2c7da143 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -91,7 +91,7 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority) tuned_module->super.coll_module_enable = tuned_module_enable; tuned_module->super.ft_event = mca_coll_tuned_ft_event; - /* By default stick with the fied version of the tuned collectives. Later on, + /* By default stick with the fixed version of the tuned collectives. Later on, * when the module gets enabled, set the correct version based on the availability * of the dynamic rules.
*/ diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c index b7bcdd6be8..df1176ff4e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c @@ -5,6 +5,7 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +37,7 @@ static mca_base_var_enum_value_t scatter_algorithms[] = { {0, "ignore"}, {1, "basic_linear"}, {2, "binomial"}, + {3, "linear_nb"}, {0, NULL} }; @@ -74,7 +76,7 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p mca_param_indices->algorithm_param_index = mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "scatter_algorithm", - "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.", + "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 non-blocking linear.", MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, @@ -114,6 +116,38 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_scatter_chain_fanout); + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_min_procs", + "use basic linear algorithm for communicators larger than this value", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_scatter_min_procs); + + (void)mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_algorithm_max_requests", + "Issue a blocking send every this many non-blocking requests. Only has meaning for non-blocking linear algorithm.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &ompi_coll_tuned_scatter_blocking_send_ratio); + + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_intermediate_msg", + "use non-blocking linear algorithm for messages larger than this value", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_scatter_intermediate_msg); + + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_large_msg", + "use linear algorithm for messages larger than this value", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_scatter_large_msg); + return (MPI_SUCCESS); } @@ -144,6 +178,11 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, int scount, return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module); + case (3): + return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + ompi_coll_tuned_scatter_blocking_send_ratio); } /* switch */ OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 9d6cc8ade2..08e9f77c25 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -4,6 +4,7 @@ * reserved. 
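A note on the scatter knobs registered above: algorithm 3 (linear_nb) maps to ompi_coll_base_scatter_intra_linear_nb, and its last argument, exposed as coll_tuned_scatter_algorithm_max_requests, makes the root issue a blocking send every so many non-blocking ones so the number of outstanding requests stays bounded. Since the decision thresholds all default to 0, the fixed decision never selects linear_nb on its own: a user either forces it the usual tuned way (coll_tuned_use_dynamic_rules=1 plus coll_tuned_scatter_algorithm=3) or enables the heuristic by setting scatter_min_procs, scatter_intermediate_msg and scatter_large_msg. With hypothetical values 1024, 8192 and 250000, a 128-rank communicator with 16 KiB blocks qualifies, since 64 < 128 < 1024 and 8192 <= 16384 < 250000. The following is a rough root-side sketch of the throttling idea, using an illustrative helper that is not the actual Open MPI routine (tagging, the root's local copy, and error recovery all differ).

    #include <mpi.h>
    #include <stdlib.h>

    /* Illustrative sketch of a throttled non-blocking linear scatter, root side
     * only: isend one block per peer, but turn every max_reqs-th send into a
     * blocking one so injection stays paced. Hypothetical helper, not
     * ompi_coll_base_scatter_intra_linear_nb itself. */
    static int scatter_root_linear_nb_sketch(const char *sbuf, int blocklen,
                                             MPI_Datatype dt, int max_reqs,
                                             MPI_Comm comm)
    {
        int rank, size, nreqs = 0, err = MPI_SUCCESS;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        MPI_Request *reqs = malloc((size_t)size * sizeof(*reqs));
        if (NULL == reqs) { return MPI_ERR_NO_MEM; }

        MPI_Aint lb, extent;
        MPI_Type_get_extent(dt, &lb, &extent);

        for (int peer = 0; peer < size; ++peer) {
            if (peer == rank) { continue; } /* real code copies the root's block locally */
            const char *block = sbuf + (MPI_Aint)peer * blocklen * extent;
            if (max_reqs > 0 && 0 == (peer + 1) % max_reqs) {
                err = MPI_Send(block, blocklen, dt, peer, 0, comm); /* paces injection */
            } else {
                err = MPI_Isend(block, blocklen, dt, peer, 0, comm, &reqs[nreqs++]);
            }
            if (MPI_SUCCESS != err) { break; }
        }
        if (MPI_SUCCESS == err) {
            err = MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
        }
        free(reqs);
        return err;
    }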
* Copyright (c) 2020 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -389,7 +390,7 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind } #endif -#if OPAL_DEBUG_ENABLE +#if OPAL_ENABLE_DEBUG opal_output_verbose(1, opal_common_ofi.output, "local rank: %d device: %s cpusets match: %s\n", local_index, provider->domain_attr->name,
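Finally, the common_ofi change above is more than a spelling preference: OPAL_DEBUG_ENABLE is not a symbol Open MPI's configure defines, and in a #if directive an undefined identifier evaluates to 0, so the guarded verbose output could never be compiled in, even in debug builds. OPAL_ENABLE_DEBUG is the configure-driven macro. A self-contained illustration of the pitfall (the #define here stands in for what configure normally provides):

    #include <stdio.h>

    #define OPAL_ENABLE_DEBUG 1 /* stands in for the configure-generated define */

    int main(void)
    {
    #if OPAL_DEBUG_ENABLE  /* misspelled: undefined, evaluates to 0, never taken */
        puts("unreachable: never printed, even in debug builds");
    #endif
    #if OPAL_ENABLE_DEBUG  /* correct macro: active whenever debug is enabled */
        puts("debug output compiled in");
    #endif
        return 0;
    }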