1
1
Fork 0

COLL/TUNED: Add linear scatter using isend for mlnx platform

Signed-off-by: Mikhail Brinskii <mikhailb@mellanox.com>
(cherry picked from commit f2cbd4806e)
Signed-off-by: Brian Barrett <bbarrett@amazon.com>
This commit is contained in:
Mikhail Brinskii 2019-10-24 16:06:27 +00:00 committed by Brian Barrett
parent 221fad6862
commit 7eb94164a0
7 changed files with 181 additions and 2 deletions

View File

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2019 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -84,4 +85,8 @@ bml_r2_show_unreach_errors = 0
coll_tuned_alltoall_large_msg = 250000
coll_tuned_alltoall_min_procs = 2048
coll_tuned_alltoall_algorithm_max_requests = 8
coll_tuned_scatter_intermediate_msg = 8192
coll_tuned_scatter_large_msg = 250000
coll_tuned_scatter_min_procs = 1048510
coll_tuned_scatter_algorithm_max_requests = 64

View File

@ -18,6 +18,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -291,6 +292,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
/* Scatter */
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
/* Linear scatter in which the root distributes the data with non-blocking
 * sends. When max_reqs > 1 the root uses a blocking send for every destination
 * rank that is a multiple of max_reqs; max_reqs <= 1 makes every send
 * non-blocking. */
int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs);
/* ScatterV */

View File

@ -14,6 +14,7 @@
* reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -273,5 +274,114 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount,
return MPI_SUCCESS;
}
/* copied function (with appropriate renaming) ends here */
/*
 * ompi_coll_base_scatter_intra_linear_nb
 *
 * Function:  Linear scatter in which the root distributes the blocks with
 *            non-blocking sends, periodically synchronised by a blocking send.
 * Accepts:   The usual scatter arguments plus max_reqs.
 * Returns:   MPI_SUCCESS, or the error code reported by the failing operation.
 *
 * Non-root ranks post a single blocking receive from the root.
 *
 * The root walks over all ranks in order. Its own block is copied locally
 * (skipped when rbuf is MPI_IN_PLACE). Every other rank gets an isend, except
 * that when max_reqs > 1 each destination rank that is a multiple of max_reqs
 * gets a blocking send instead. A blocking send acts like a local resources
 * flush, because it ensures progression until the message is sent (or copied
 * to some sort of transmit buffer). With max_reqs <= 1 all sends are
 * non-blocking.
 */
int
ompi_coll_base_scatter_intra_linear_nb(const void *sbuf, int scount,
                                       struct ompi_datatype_t *sdtype,
                                       void *rbuf, int rcount,
                                       struct ompi_datatype_t *rdtype,
                                       int root,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module,
                                       int max_reqs)
{
    int i, rank, size, err, line;
    int nreqs = 0;      /* capacity of the request array */
    int nposted = 0;    /* number of isends actually posted */
    ptrdiff_t incr;
    char *ptmp;
    ompi_request_t **reqs = NULL;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* If not root, receive data. */
    if (rank != root) {
        err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
                                MCA_COLL_BASE_TAG_SCATTER,
                                comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            line = __LINE__; goto err_hndl;
        }
        return MPI_SUCCESS;
    }

    if (max_reqs <= 1) {
        /* No periodic blocking send: one request per peer. */
        max_reqs = 0;
        nreqs = size - 1; /* no send for myself */
    } else {
        /* We use blocking MPI_Send (which does not need a request)
         * every max_reqs send operation (which is size/max_reqs at most),
         * therefore no need to allocate requests for these sends. */
        nreqs = size - (size / max_reqs);
    }

    /* A single-process communicator with max_reqs <= 1 needs no requests at
     * all. Only ask for the array when there is something to store, so that a
     * NULL result always means a real allocation failure.
     * NOTE(review): assumes ompi_coll_base_comm_get_reqs() may return NULL for
     * a zero-length request -- confirm against its implementation. */
    if (nreqs > 0) {
        reqs = ompi_coll_base_comm_get_reqs(module->base_data, nreqs);
        if (NULL == reqs) {
            err = OMPI_ERR_OUT_OF_RESOURCE;
            line = __LINE__; goto err_hndl;
        }
    }

    err = ompi_datatype_type_extent(sdtype, &incr);
    if (OMPI_SUCCESS != err) {
        line = __LINE__; goto err_hndl;
    }
    /* Distance in bytes between two consecutive blocks of the send buffer. */
    incr *= scount;

    /* I am the root, loop sending data. */
    for (i = 0, ptmp = (char *)sbuf; i < size; ++i, ptmp += incr) {
        /* simple optimization */
        if (i == rank) {
            if (MPI_IN_PLACE != rbuf) {
                err = ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
                                           rdtype);
            }
        } else {
            if (!max_reqs || (i % max_reqs)) {
                err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i,
                                         MCA_COLL_BASE_TAG_SCATTER,
                                         MCA_PML_BASE_SEND_STANDARD,
                                         comm, &reqs[nposted]));
                ++nposted;
            } else {
                err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
                                        MCA_COLL_BASE_TAG_SCATTER,
                                        MCA_PML_BASE_SEND_STANDARD,
                                        comm));
            }
        }
        if (MPI_SUCCESS != err) {
            line = __LINE__; goto err_hndl;
        }
    }

    /* Complete the outstanding non-blocking sends, if any were posted. */
    if (nposted > 0) {
        err = ompi_request_wait_all(nposted, reqs, MPI_STATUSES_IGNORE);
        if (MPI_SUCCESS != err) {
            line = __LINE__; goto err_hndl;
        }
    }

    return MPI_SUCCESS;

err_hndl:
    if (NULL != reqs) {
        /* find a real error code */
        if (MPI_ERR_IN_STATUS == err) {
            for (i = 0; i < nreqs; i++) {
                if (MPI_REQUEST_NULL == reqs[i]) continue;
                if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
                err = reqs[i]->req_status.MPI_ERROR;
                break;
            }
        }
        ompi_coll_base_free_reqs(reqs, nreqs);
    }
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank));
    (void)line;  /* silence compiler warning */
    return err;
}

View File

@ -5,6 +5,7 @@
* reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -41,6 +42,10 @@ extern int ompi_coll_tuned_alltoall_intermediate_msg;
extern int ompi_coll_tuned_alltoall_large_msg;
extern int ompi_coll_tuned_alltoall_min_procs;
extern int ompi_coll_tuned_alltoall_max_requests;
/* Scatter tunables read by the fixed scatter decision function when it
 * considers the non-blocking linear algorithm. */
extern int ompi_coll_tuned_scatter_intermediate_msg;
extern int ompi_coll_tuned_scatter_large_msg;
extern int ompi_coll_tuned_scatter_min_procs;
/* Passed as max_reqs to the non-blocking linear scatter; registered as the
 * MCA variable "scatter_algorithm_max_requests". */
extern int ompi_coll_tuned_scatter_blocking_send_ratio;
/* forced algorithm choices */
/* this structure is for storing the indexes to the forced algorithm mca params... */

View File

@ -16,6 +16,7 @@
* reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -64,6 +65,12 @@ int ompi_coll_tuned_alltoall_large_msg = 3000;
int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */
int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
/* Scatter tunables for the non-blocking linear algorithm. Disabled by default:
 * with scatter_min_procs left at 0 the fixed decision function's
 * "communicator_size < min_procs" test can never be true. */
int ompi_coll_tuned_scatter_intermediate_msg = 0;
int ompi_coll_tuned_scatter_large_msg = 0;
int ompi_coll_tuned_scatter_min_procs = 0;
/* Forwarded as max_reqs to the non-blocking linear scatter. */
int ompi_coll_tuned_scatter_blocking_send_ratio = 0;
/* forced algorithm variables */
/* indices for the MCA parameters */
coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}};

View File

@ -15,6 +15,7 @@
* reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -780,6 +781,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
{
const size_t small_block_size = 300;
const int small_comm_size = 10;
const int intermediate_comm_size = 64;
int communicator_size, rank;
size_t dsize, block_size;
@ -802,7 +804,16 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
} else if ((communicator_size < ompi_coll_tuned_scatter_min_procs) &&
(communicator_size > intermediate_comm_size) &&
(block_size >= ompi_coll_tuned_scatter_intermediate_msg) &&
(block_size < ompi_coll_tuned_scatter_large_msg)) {
return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
ompi_coll_tuned_scatter_blocking_send_ratio);
}
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);

View File

@ -5,6 +5,7 @@
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -36,6 +37,7 @@ static mca_base_var_enum_value_t scatter_algorithms[] = {
{0, "ignore"},
{1, "basic_linear"},
{2, "binomial"},
{3, "linear_nb"},
{0, NULL}
};
@ -74,7 +76,7 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
mca_param_indices->algorithm_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm",
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 non-blocking linear.",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_ALL,
@ -114,6 +116,38 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
MCA_BASE_VAR_SCOPE_ALL,
&coll_tuned_scatter_chain_fanout);
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_min_procs",
"use basic linear algorithm for communicators larger than this value",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_coll_tuned_scatter_min_procs);
(void)mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_algorithm_max_requests",
"Issue a blocking send every this many non-blocking requests. Only has meaning for non-blocking linear algorithm.",
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_ALL,
&ompi_coll_tuned_scatter_blocking_send_ratio);
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_intermediate_msg",
"use non-blocking linear algorithm for messages larger than this value",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_coll_tuned_scatter_intermediate_msg);
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"scatter_large_msg",
"use linear algorithm for messages larger than this value",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_coll_tuned_scatter_large_msg);
return (MPI_SUCCESS);
}
@ -144,6 +178,11 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, int scount,
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module);
case (3):
return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
root, comm, module,
ompi_coll_tuned_scatter_blocking_send_ratio);
} /* switch */
OPAL_OUTPUT((ompi_coll_tuned_stream,
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",