COLL/TUNED: Add linear scatter using isend for mlnx platform

Signed-off-by: Mikhail Brinskii <mikhailb@mellanox.com> (cherry picked from commit f2cbd4806e9a38b5e58c0fc69b41624af79fb99b) Signed-off-by: Brian Barrett <bbarrett@amazon.com>
2019-10-24 16:06:27 +00:00 · 2019-10-24 16:06:27 +00:00 · 7eb94164a0
--- a/contrib/platform/mellanox/optimized.conf
+++ b/contrib/platform/mellanox/optimized.conf
@ -10,6 +10,7 @@
 # Copyright (c) 2004-2005 The Regents of the University of California.
 #                         All rights reserved.
 # Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2019      Mellanox Technologies. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@ -84,4 +85,8 @@ bml_r2_show_unreach_errors = 0
 coll_tuned_alltoall_large_msg              = 250000
 coll_tuned_alltoall_min_procs              = 2048
 coll_tuned_alltoall_algorithm_max_requests = 8
+coll_tuned_scatter_intermediate_msg        = 8192
+coll_tuned_scatter_large_msg               = 250000
+coll_tuned_scatter_min_procs               = 1048510
+coll_tuned_scatter_algorithm_max_requests  = 64

--- a/ompi/mca/coll/base/coll_base_functions.h
+++ b/ompi/mca/coll/base/coll_base_functions.h
@ -18,6 +18,7 @@
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2016-2017 IBM Corporation.  All rights reserved.
 * Copyright (c) 2017      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2019      Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -291,6 +292,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
 /* Scatter */
 int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
 int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
+int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs); /* max_reqs <= 1: non-blocking sends only */

 /* ScatterV */

--- a/ompi/mca/coll/base/coll_base_scatter.c
+++ b/ompi/mca/coll/base/coll_base_scatter.c
@ -14,6 +14,7 @@
 *                         reserved.
 * Copyright (c) 2015-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2019      Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -273,5 +274,114 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount,
    return MPI_SUCCESS;
 }

-
 /* copied function (with appropriate renaming) ends here */
+
+/*
+ * ompi_coll_base_scatter_intra_linear_nb
+ *
+ * Function:  Linear scatter in which the root distributes the data with
+ *            non-blocking sends, periodically synchronizing with a
+ *            blocking send.
+ * Accepts:   The common scatter arguments (SCATTER_ARGS), plus
+ *            max_reqs - every destination rank i with (i % max_reqs) == 0
+ *                       is served by a blocking send; a value <= 1 turns
+ *                       this off so that only non-blocking sends are used.
+ * Returns:   MPI_SUCCESS, or the error code of the operation that failed.
+ *
+ * The blocking send acts like a local resources flush, because it ensures
+ * progression until the message is sent (or copied to some sort of
+ * transmit buffer).
+ */
+int
+ompi_coll_base_scatter_intra_linear_nb(const void *sbuf, int scount,
+                                       struct ompi_datatype_t *sdtype,
+                                       void *rbuf, int rcount,
+                                       struct ompi_datatype_t *rdtype,
+                                       int root,
+                                       struct ompi_communicator_t *comm,
+                                       mca_coll_base_module_t *module,
+                                       int max_reqs)
+{
+    int i, rank, size, err, line, nreqs;
+    ptrdiff_t incr;
+    char *ptmp;
+    ompi_request_t **reqs = NULL, **preq;
+
+    rank = ompi_comm_rank(comm);
+    size = ompi_comm_size(comm);
+
+    /* If not root, receive data. */
+    if (rank != root) {
+        err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
+                                MCA_COLL_BASE_TAG_SCATTER,
+                                comm, MPI_STATUS_IGNORE));
+        if (MPI_SUCCESS != err) {
+            /* reqs is still NULL here, so err_hndl only logs and returns */
+            line = __LINE__; goto err_hndl;
+        }
+
+        return MPI_SUCCESS;
+    }
+
+    if (max_reqs <= 1) {
+        /* 0 means "never use a blocking send" in the loop below */
+        max_reqs = 0;
+        nreqs = size - 1; /* no send for myself */
+    } else {
+        /* Every destination i with (i % max_reqs) == 0 gets a blocking
+         * send, which does not need a request. size - size/max_reqs is
+         * therefore a safe upper bound on the number of isends posted. */
+        nreqs = size - (size / max_reqs);
+    }
+
+    reqs = ompi_coll_base_comm_get_reqs(module->base_data, nreqs);
+    if (NULL == reqs) {
+        err = OMPI_ERR_OUT_OF_RESOURCE;
+        line = __LINE__; goto err_hndl;
+    }
+
+    err = ompi_datatype_type_extent(sdtype, &incr);
+    if (OMPI_SUCCESS != err) {
+        line = __LINE__; goto err_hndl;
+    }
+    /* distance in bytes between two consecutive per-rank blocks of sbuf */
+    incr *= scount;
+
+    /* I am the root, loop sending data. */
+    for (i = 0, ptmp = (char *)sbuf, preq = reqs; i < size; ++i, ptmp += incr) {
+        /* simple optimization: the root copies its own block locally */
+        if (i == rank) {
+            if (MPI_IN_PLACE != rbuf) {
+                err = ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
+                                           rdtype);
+            }
+        } else {
+            if (!max_reqs || (i % max_reqs)) {
+                /* preq only advances for isends, so preq - reqs is the
+                 * number of requests to wait for afterwards */
+                err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i,
+                                         MCA_COLL_BASE_TAG_SCATTER,
+                                         MCA_PML_BASE_SEND_STANDARD,
+                                         comm, preq++));
+            } else {
+                err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
+                                        MCA_COLL_BASE_TAG_SCATTER,
+                                        MCA_PML_BASE_SEND_STANDARD,
+                                        comm));
+            }
+        }
+        if (MPI_SUCCESS != err) {
+            line = __LINE__; goto err_hndl;
+        }
+    }
+
+    err = ompi_request_wait_all(preq - reqs, reqs, MPI_STATUSES_IGNORE);
+    if (MPI_SUCCESS != err) {
+        line = __LINE__; goto err_hndl;
+    }
+
+    return MPI_SUCCESS;
+
+err_hndl:
+    if (NULL != reqs) {
+        /* find a real error code */
+        if (MPI_ERR_IN_STATUS == err) {
+            for (i = 0; i < nreqs; i++) {
+                if (MPI_REQUEST_NULL == reqs[i]) continue;
+                if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
+                /* Skip requests that completed fine, so that a success
+                 * status can never overwrite the failure being reported. */
+                if (MPI_SUCCESS == reqs[i]->req_status.MPI_ERROR) continue;
+                err = reqs[i]->req_status.MPI_ERROR;
+                break;
+            }
+        }
+        ompi_coll_base_free_reqs(reqs, nreqs);
+    }
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
+                "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank));
+    (void)line;  /* silence compiler warning */
+    return err;
+}
+
--- a/ompi/mca/coll/tuned/coll_tuned.h
+++ b/ompi/mca/coll/tuned/coll_tuned.h
@ -5,6 +5,7 @@
 *                         reserved.
 * Copyright (c) 2015-2018 Research Organization for Information Science
 *                         and Technology (RIST).  All rights reserved.
+ * Copyright (c) 2019      Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -41,6 +42,10 @@ extern int   ompi_coll_tuned_alltoall_intermediate_msg;
 extern int   ompi_coll_tuned_alltoall_large_msg;
 extern int   ompi_coll_tuned_alltoall_min_procs;
 extern int   ompi_coll_tuned_alltoall_max_requests;
+extern int   ompi_coll_tuned_scatter_intermediate_msg;    /* MCA param: scatter_intermediate_msg */
+extern int   ompi_coll_tuned_scatter_large_msg;           /* MCA param: scatter_large_msg */
+extern int   ompi_coll_tuned_scatter_min_procs;           /* MCA param: scatter_min_procs */
+extern int   ompi_coll_tuned_scatter_blocking_send_ratio; /* MCA param: scatter_algorithm_max_requests */

 /* forced algorithm choices */
 /* this structure is for storing the indexes to the forced algorithm mca params... */
--- a/ompi/mca/coll/tuned/coll_tuned_component.c
+++ b/ompi/mca/coll/tuned/coll_tuned_component.c
@ -16,6 +16,7 @@
 *                         reserved.
 * Copyright (c) 2015-2018 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2019      Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -64,6 +65,12 @@ int   ompi_coll_tuned_alltoall_large_msg = 3000;
 int   ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */
 int   ompi_coll_tuned_alltoall_max_requests  = 0; /* no limit for alltoall by default */

+/* Tuning knobs for the non-blocking linear scatter. Disabled by default:
+ * with every value left at 0 the fixed decision function never selects it,
+ * because no communicator size is smaller than a min_procs of 0. */
+int   ompi_coll_tuned_scatter_intermediate_msg = 0;
+int   ompi_coll_tuned_scatter_large_msg = 0;
+int   ompi_coll_tuned_scatter_min_procs = 0;
+int   ompi_coll_tuned_scatter_blocking_send_ratio = 0; /* registered as scatter_algorithm_max_requests */
+
 /* forced alogrithm variables */
 /* indices for the MCA parameters */
 coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}};
--- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
@ -15,6 +15,7 @@
 *                         reserved.
 * Copyright (c) 2015-2018 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2019      Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -780,6 +781,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
 {
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
+    const int intermediate_comm_size = 64;
    int communicator_size, rank;
    size_t dsize, block_size;

@ -802,7 +804,16 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
        return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     root, comm, module);
+    } else if ((communicator_size < ompi_coll_tuned_scatter_min_procs) &&
+               (communicator_size > intermediate_comm_size) &&
+               (block_size >= ompi_coll_tuned_scatter_intermediate_msg) &&
+               (block_size < ompi_coll_tuned_scatter_large_msg)) {
+        return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
+                                                      rbuf, rcount, rdtype,
+                                                      root, comm, module,
+                                                      ompi_coll_tuned_scatter_blocking_send_ratio);
    }
+
    return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     root, comm, module);
--- a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
@ -5,6 +5,7 @@
 *                         reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2019      Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@ -36,6 +37,7 @@ static mca_base_var_enum_value_t scatter_algorithms[] = {
    {0, "ignore"},
    {1, "basic_linear"},
    {2, "binomial"},
+    {3, "linear_nb"},
    {0, NULL}
 };

@ -74,7 +76,7 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
    mca_param_indices->algorithm_param_index =
        mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                        "scatter_algorithm",
-                                        "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
+                                        "Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 non-blocking linear.",
                                        MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                        OPAL_INFO_LVL_5,
                                        MCA_BASE_VAR_SCOPE_ALL,
@ -114,6 +116,38 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
                                      MCA_BASE_VAR_SCOPE_ALL,
                                      &coll_tuned_scatter_chain_fanout);

+    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
+                                           "scatter_min_procs",
+                                           "use basic linear algorithm for communicators larger than this value",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_6,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &ompi_coll_tuned_scatter_min_procs);
+
+    (void)mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
+                                          "scatter_algorithm_max_requests",
+                                          "Issue a blocking send every this many non-blocking requests. Only has meaning for non-blocking linear algorithm.",
+                                          MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
+                                          OPAL_INFO_LVL_5,
+                                          MCA_BASE_VAR_SCOPE_ALL,
+                                          &ompi_coll_tuned_scatter_blocking_send_ratio);
+
+    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
+                                           "scatter_intermediate_msg",
+                                           "use non-blocking linear algorithm for messages larger than this value",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_6,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &ompi_coll_tuned_scatter_intermediate_msg);
+
+    (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
+                                           "scatter_large_msg",
+                                           "use linear algorithm for messages larger than this value",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_6,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &ompi_coll_tuned_scatter_large_msg);
+
    return (MPI_SUCCESS);
 }

@ -144,6 +178,11 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, int scount,
        return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     root, comm, module);
+    case (3):
+        return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
+                                                      rbuf, rcount, rdtype,
+                                                      root, comm, module,
+                                                      ompi_coll_tuned_scatter_blocking_send_ratio);
    } /* switch */
    OPAL_OUTPUT((ompi_coll_tuned_stream,
                 "coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",