diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf index 048d85f342..b86b37c9e2 100644 --- a/contrib/platform/mellanox/optimized.conf +++ b/contrib/platform/mellanox/optimized.conf @@ -10,6 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2019 Mellanox Technologies. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -84,4 +85,8 @@ bml_r2_show_unreach_errors = 0 coll_tuned_alltoall_large_msg = 250000 coll_tuned_alltoall_min_procs = 2048 coll_tuned_alltoall_algorithm_max_requests = 8 +coll_tuned_scatter_intermediate_msg = 8192 +coll_tuned_scatter_large_msg = 250000 +coll_tuned_scatter_min_procs = 1048510 +coll_tuned_scatter_algorithm_max_requests = 64 diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index 40de8762eb..11b46ba47e 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -18,6 +18,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2017 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -291,6 +292,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS); /* Scatter */ int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS); int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS); +int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs); /* ScatterV */ diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index 648845689d..0ca3597153 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -14,6 +14,7 @@ * reserved. 
* Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -273,5 +274,114 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount, return MPI_SUCCESS; } - /* copied function (with appropriate renaming) ends here */ + +/* + * Linear scatter: the root sends each peer its block with isend, with periodic sync by blocking send; non-root ranks post a single recv from root and return. + * A blocking send acts like a local resources flush, because it ensures + * progression until the message is sent (or copied to some sort of transmit buffer). When max_reqs > 1 the send to every rank i with i % max_reqs == 0 is blocking; max_reqs <= 1 makes every send an isend. Returns MPI_SUCCESS, or the error code of the operation that failed. + */ +int +ompi_coll_base_scatter_intra_linear_nb(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module, + int max_reqs) +{ + int i, rank, size, err, line, nreqs; + ptrdiff_t incr; + char *ptmp; + ompi_request_t **reqs = NULL, **preq; + + rank = ompi_comm_rank(comm); + size = ompi_comm_size(comm); + + /* Non-root ranks only receive their block from root and return; reqs is still NULL here, so the error path frees nothing. */ + if (rank != root) { + err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root, + MCA_COLL_BASE_TAG_SCATTER, + comm, MPI_STATUS_IGNORE)); + if (MPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + + return MPI_SUCCESS; + } + + if (max_reqs <= 1) { + max_reqs = 0; + nreqs = size - 1; /* no send for myself; max_reqs == 0 means "isend only" in the loop below. NOTE(review): nreqs is 0 when size == 1 -- confirm ompi_coll_base_comm_get_reqs() does not return NULL for a zero count, otherwise the NULL check below reports OMPI_ERR_OUT_OF_RESOURCE */ + } else { + /* We use a blocking send (which does not need a request) + * for every rank that is a multiple of max_reqs (which is size/max_reqs sends at most), + * therefore size/max_reqs requests are left out of the allocation. 
*/ + nreqs = size - (size / max_reqs); + } + + reqs = ompi_coll_base_comm_get_reqs(module->base_data, nreqs); + if (NULL == reqs) { + err = OMPI_ERR_OUT_OF_RESOURCE; + line = __LINE__; goto err_hndl; + } + + err = ompi_datatype_type_extent(sdtype, &incr); + if (OMPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + incr *= scount; + + /* Root only from here on: walk the send buffer one block per rank, where a block is scount elements of sdtype (incr bytes). */ + for (i = 0, ptmp = (char *)sbuf, preq = reqs; i < size; ++i, ptmp += incr) { + /* root's own block goes through ompi_datatype_sndrcv instead of a PML send, and is skipped entirely for MPI_IN_PLACE */ + if (i == rank) { + if (MPI_IN_PLACE != rbuf) { + err = ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount, + rdtype); + } + } else { + if (!max_reqs || (i % max_reqs)) { + err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i, + MCA_COLL_BASE_TAG_SCATTER, + MCA_PML_BASE_SEND_STANDARD, + comm, preq++)); + } else { + err = MCA_PML_CALL(send(ptmp, scount, sdtype, i, + MCA_COLL_BASE_TAG_SCATTER, + MCA_PML_BASE_SEND_STANDARD, + comm)); + } + } + if (MPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + } + + err = ompi_request_wait_all(preq - reqs, reqs, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != err) { + line = __LINE__; goto err_hndl; + } + + return MPI_SUCCESS; + +err_hndl: + if (NULL != reqs) { + /* MPI_ERR_IN_STATUS is generic: report the first per-request error that is not MPI_ERR_PENDING as the real error code */ + if (MPI_ERR_IN_STATUS == err) { + for (i = 0; i < nreqs; i++) { + if (MPI_REQUEST_NULL == reqs[i]) continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue; + err = reqs[i]->req_status.MPI_ERROR; + break; + } + } + ompi_coll_base_free_reqs(reqs, nreqs); + } + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); + (void)line; /* silence compiler warning (presumably for builds where OPAL_OUTPUT expands to nothing -- TODO confirm) */ + return err; +} + diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index d6fc4b89bd..e4d66cc600 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -5,6 +5,7 @@ * reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,6 +42,10 @@ extern int ompi_coll_tuned_alltoall_intermediate_msg; extern int ompi_coll_tuned_alltoall_large_msg; extern int ompi_coll_tuned_alltoall_min_procs; extern int ompi_coll_tuned_alltoall_max_requests; +extern int ompi_coll_tuned_scatter_intermediate_msg; +extern int ompi_coll_tuned_scatter_large_msg; +extern int ompi_coll_tuned_scatter_min_procs; +extern int ompi_coll_tuned_scatter_blocking_send_ratio; /* forced algorithm choices */ /* this structure is for storing the indexes to the forced algorithm mca params... */ diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index 25e9bc77a0..a17cfacb12 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -16,6 +16,7 @@ * reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -64,6 +65,12 @@ int ompi_coll_tuned_alltoall_large_msg = 3000; int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */ int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */ +/* Disable by default */ +int ompi_coll_tuned_scatter_intermediate_msg = 0; +int ompi_coll_tuned_scatter_large_msg = 0; +int ompi_coll_tuned_scatter_min_procs = 0; +int ompi_coll_tuned_scatter_blocking_send_ratio = 0; + /* forced alogrithm variables */ /* indices for the MCA parameters */ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}}; diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 97560c5c08..b3699ed273 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -15,6 +15,7 @@ * reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -780,6 +781,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount, { const size_t small_block_size = 300; const int small_comm_size = 10; + const int intermediate_comm_size = 64; int communicator_size, rank; size_t dsize, block_size; @@ -802,7 +804,16 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount, return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module); + } else if ((communicator_size < ompi_coll_tuned_scatter_min_procs) && + (communicator_size > intermediate_comm_size) && + (block_size >= ompi_coll_tuned_scatter_intermediate_msg) && + (block_size < ompi_coll_tuned_scatter_large_msg)) { + return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + ompi_coll_tuned_scatter_blocking_send_ratio); } + return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm, module); diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c index b7bcdd6be8..df1176ff4e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c @@ -5,6 +5,7 @@ * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +37,7 @@ static mca_base_var_enum_value_t scatter_algorithms[] = { {0, "ignore"}, {1, "basic_linear"}, {2, "binomial"}, + {3, "linear_nb"}, {0, NULL} }; @@ -74,7 +76,7 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p mca_param_indices->algorithm_param_index = mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "scatter_algorithm", - "Which scatter algorithm is used. 
Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.", + "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 non-blocking linear.", MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, @@ -114,6 +116,38 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_scatter_chain_fanout); + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_min_procs", + "use basic linear algorithm for communicators larger than this value", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_scatter_min_procs); + + (void)mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_algorithm_max_requests", + "Issue a blocking send every this many non-blocking requests. Only has meaning for non-blocking linear algorithm.", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_ALL, + &ompi_coll_tuned_scatter_blocking_send_ratio); + + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_intermediate_msg", + "use non-blocking linear algorithm for messages larger than this value", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_scatter_intermediate_msg); + + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "scatter_large_msg", + "use linear algorithm for messages larger than this value", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_scatter_large_msg); + return (MPI_SUCCESS); } @@ -144,6 +178,11 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, int scount, return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype, rbuf, 
rcount, rdtype, root, comm, module); + case (3): + return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, module, + ompi_coll_tuned_scatter_blocking_send_ratio); } /* switch */ OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",