COLL/TUNED: Add linear scatter using isend for mlnx platform
Signed-off-by: Mikhail Brinskii <mikhailb@mellanox.com>
(cherry picked from commit f2cbd4806e9a38b5e58c0fc69b41624af79fb99b)
Signed-off-by: Brian Barrett <bbarrett@amazon.com>
Этот коммит содержится в:
родитель
221fad6862
Коммит
7eb94164a0
@ -10,6 +10,7 @@
|
|||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||||
|
# Copyright (c) 2019 Mellanox Technologies. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -84,4 +85,8 @@ bml_r2_show_unreach_errors = 0
|
|||||||
coll_tuned_alltoall_large_msg = 250000
|
coll_tuned_alltoall_large_msg = 250000
|
||||||
coll_tuned_alltoall_min_procs = 2048
|
coll_tuned_alltoall_min_procs = 2048
|
||||||
coll_tuned_alltoall_algorithm_max_requests = 8
|
coll_tuned_alltoall_algorithm_max_requests = 8
|
||||||
|
coll_tuned_scatter_intermediate_msg = 8192
|
||||||
|
coll_tuned_scatter_large_msg = 250000
|
||||||
|
coll_tuned_scatter_min_procs = 1048510
|
||||||
|
coll_tuned_scatter_algorithm_max_requests = 64
|
||||||
|
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
|
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
|
||||||
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
|
||||||
|
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -291,6 +292,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
|
|||||||
/* Scatter */
|
/* Scatter */
|
||||||
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
|
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
|
||||||
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
|
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
|
||||||
|
int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs);
|
||||||
|
|
||||||
/* ScatterV */
|
/* ScatterV */
|
||||||
|
|
||||||
|
@ -14,6 +14,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015-2016 Research Organization for Information Science
|
* Copyright (c) 2015-2016 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -273,5 +274,114 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount,
|
|||||||
return MPI_SUCCESS;
|
return MPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* copied function (with appropriate renaming) ends here */
|
/* copied function (with appropriate renaming) ends here */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Use isends for distributing the data with periodic sync by blocking send.
|
||||||
|
* Blocking send acts like a local resources flush, because it ensures
|
||||||
|
* progression until the message is sent/(copied to some sort of transmit buffer).
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
ompi_coll_base_scatter_intra_linear_nb(const void *sbuf, int scount,
|
||||||
|
struct ompi_datatype_t *sdtype,
|
||||||
|
void *rbuf, int rcount,
|
||||||
|
struct ompi_datatype_t *rdtype,
|
||||||
|
int root,
|
||||||
|
struct ompi_communicator_t *comm,
|
||||||
|
mca_coll_base_module_t *module,
|
||||||
|
int max_reqs)
|
||||||
|
{
|
||||||
|
int i, rank, size, err, line, nreqs;
|
||||||
|
ptrdiff_t incr;
|
||||||
|
char *ptmp;
|
||||||
|
ompi_request_t **reqs = NULL, **preq;
|
||||||
|
|
||||||
|
rank = ompi_comm_rank(comm);
|
||||||
|
size = ompi_comm_size(comm);
|
||||||
|
|
||||||
|
/* If not root, receive data. */
|
||||||
|
if (rank != root) {
|
||||||
|
err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
|
||||||
|
MCA_COLL_BASE_TAG_SCATTER,
|
||||||
|
comm, MPI_STATUS_IGNORE));
|
||||||
|
if (MPI_SUCCESS != err) {
|
||||||
|
line = __LINE__; goto err_hndl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return MPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (max_reqs <= 1) {
|
||||||
|
max_reqs = 0;
|
||||||
|
nreqs = size - 1; /* no send for myself */
|
||||||
|
} else {
|
||||||
|
/* We use blocking MPI_Send (which does not need a request)
|
||||||
|
* every max_reqs send operation (which is size/max_reqs at most),
|
||||||
|
* therefore no need to allocate requests for these sends. */
|
||||||
|
nreqs = size - (size / max_reqs);
|
||||||
|
}
|
||||||
|
|
||||||
|
reqs = ompi_coll_base_comm_get_reqs(module->base_data, nreqs);
|
||||||
|
if (NULL == reqs) {
|
||||||
|
err = OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
|
line = __LINE__; goto err_hndl;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = ompi_datatype_type_extent(sdtype, &incr);
|
||||||
|
if (OMPI_SUCCESS != err) {
|
||||||
|
line = __LINE__; goto err_hndl;
|
||||||
|
}
|
||||||
|
incr *= scount;
|
||||||
|
|
||||||
|
/* I am the root, loop sending data. */
|
||||||
|
for (i = 0, ptmp = (char *)sbuf, preq = reqs; i < size; ++i, ptmp += incr) {
|
||||||
|
/* simple optimization */
|
||||||
|
if (i == rank) {
|
||||||
|
if (MPI_IN_PLACE != rbuf) {
|
||||||
|
err = ompi_datatype_sndrcv(ptmp, scount, sdtype, rbuf, rcount,
|
||||||
|
rdtype);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!max_reqs || (i % max_reqs)) {
|
||||||
|
err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i,
|
||||||
|
MCA_COLL_BASE_TAG_SCATTER,
|
||||||
|
MCA_PML_BASE_SEND_STANDARD,
|
||||||
|
comm, preq++));
|
||||||
|
} else {
|
||||||
|
err = MCA_PML_CALL(send(ptmp, scount, sdtype, i,
|
||||||
|
MCA_COLL_BASE_TAG_SCATTER,
|
||||||
|
MCA_PML_BASE_SEND_STANDARD,
|
||||||
|
comm));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (MPI_SUCCESS != err) {
|
||||||
|
line = __LINE__; goto err_hndl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = ompi_request_wait_all(preq - reqs, reqs, MPI_STATUSES_IGNORE);
|
||||||
|
if (MPI_SUCCESS != err) {
|
||||||
|
line = __LINE__; goto err_hndl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return MPI_SUCCESS;
|
||||||
|
|
||||||
|
err_hndl:
|
||||||
|
if (NULL != reqs) {
|
||||||
|
/* find a real error code */
|
||||||
|
if (MPI_ERR_IN_STATUS == err) {
|
||||||
|
for (i = 0; i < nreqs; i++) {
|
||||||
|
if (MPI_REQUEST_NULL == reqs[i]) continue;
|
||||||
|
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
|
||||||
|
err = reqs[i]->req_status.MPI_ERROR;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ompi_coll_base_free_reqs(reqs, nreqs);
|
||||||
|
}
|
||||||
|
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
|
||||||
|
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank));
|
||||||
|
(void)line; /* silence compiler warning */
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -41,6 +42,10 @@ extern int ompi_coll_tuned_alltoall_intermediate_msg;
|
|||||||
extern int ompi_coll_tuned_alltoall_large_msg;
|
extern int ompi_coll_tuned_alltoall_large_msg;
|
||||||
extern int ompi_coll_tuned_alltoall_min_procs;
|
extern int ompi_coll_tuned_alltoall_min_procs;
|
||||||
extern int ompi_coll_tuned_alltoall_max_requests;
|
extern int ompi_coll_tuned_alltoall_max_requests;
|
||||||
|
extern int ompi_coll_tuned_scatter_intermediate_msg;
|
||||||
|
extern int ompi_coll_tuned_scatter_large_msg;
|
||||||
|
extern int ompi_coll_tuned_scatter_min_procs;
|
||||||
|
extern int ompi_coll_tuned_scatter_blocking_send_ratio;
|
||||||
|
|
||||||
/* forced algorithm choices */
|
/* forced algorithm choices */
|
||||||
/* this structure is for storing the indexes to the forced algorithm mca params... */
|
/* this structure is for storing the indexes to the forced algorithm mca params... */
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -64,6 +65,12 @@ int ompi_coll_tuned_alltoall_large_msg = 3000;
|
|||||||
int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */
|
int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */
|
||||||
int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
|
int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
|
||||||
|
|
||||||
|
/* Disable by default */
|
||||||
|
int ompi_coll_tuned_scatter_intermediate_msg = 0;
|
||||||
|
int ompi_coll_tuned_scatter_large_msg = 0;
|
||||||
|
int ompi_coll_tuned_scatter_min_procs = 0;
|
||||||
|
int ompi_coll_tuned_scatter_blocking_send_ratio = 0;
|
||||||
|
|
||||||
/* forced alogrithm variables */
|
/* forced alogrithm variables */
|
||||||
/* indices for the MCA parameters */
|
/* indices for the MCA parameters */
|
||||||
coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}};
|
coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}};
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -780,6 +781,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
|
|||||||
{
|
{
|
||||||
const size_t small_block_size = 300;
|
const size_t small_block_size = 300;
|
||||||
const int small_comm_size = 10;
|
const int small_comm_size = 10;
|
||||||
|
const int intermediate_comm_size = 64;
|
||||||
int communicator_size, rank;
|
int communicator_size, rank;
|
||||||
size_t dsize, block_size;
|
size_t dsize, block_size;
|
||||||
|
|
||||||
@ -802,7 +804,16 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, int scount,
|
|||||||
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
|
} else if ((communicator_size < ompi_coll_tuned_scatter_min_procs) &&
|
||||||
|
(communicator_size > intermediate_comm_size) &&
|
||||||
|
(block_size >= ompi_coll_tuned_scatter_intermediate_msg) &&
|
||||||
|
(block_size < ompi_coll_tuned_scatter_large_msg)) {
|
||||||
|
return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
|
||||||
|
rbuf, rcount, rdtype,
|
||||||
|
root, comm, module,
|
||||||
|
ompi_coll_tuned_scatter_blocking_send_ratio);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
|
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -36,6 +37,7 @@ static mca_base_var_enum_value_t scatter_algorithms[] = {
|
|||||||
{0, "ignore"},
|
{0, "ignore"},
|
||||||
{1, "basic_linear"},
|
{1, "basic_linear"},
|
||||||
{2, "binomial"},
|
{2, "binomial"},
|
||||||
|
{3, "linear_nb"},
|
||||||
{0, NULL}
|
{0, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -74,7 +76,7 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
|
|||||||
mca_param_indices->algorithm_param_index =
|
mca_param_indices->algorithm_param_index =
|
||||||
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||||
"scatter_algorithm",
|
"scatter_algorithm",
|
||||||
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial.",
|
"Which scatter algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 binomial, 3 non-blocking linear.",
|
||||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||||
OPAL_INFO_LVL_5,
|
OPAL_INFO_LVL_5,
|
||||||
MCA_BASE_VAR_SCOPE_ALL,
|
MCA_BASE_VAR_SCOPE_ALL,
|
||||||
@ -114,6 +116,38 @@ ompi_coll_tuned_scatter_intra_check_forced_init(coll_tuned_force_algorithm_mca_p
|
|||||||
MCA_BASE_VAR_SCOPE_ALL,
|
MCA_BASE_VAR_SCOPE_ALL,
|
||||||
&coll_tuned_scatter_chain_fanout);
|
&coll_tuned_scatter_chain_fanout);
|
||||||
|
|
||||||
|
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||||
|
"scatter_min_procs",
|
||||||
|
"use basic linear algorithm for communicators larger than this value",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_6,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&ompi_coll_tuned_scatter_min_procs);
|
||||||
|
|
||||||
|
(void)mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||||
|
"scatter_algorithm_max_requests",
|
||||||
|
"Issue a blocking send every this many non-blocking requests. Only has meaning for non-blocking linear algorithm.",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||||
|
OPAL_INFO_LVL_5,
|
||||||
|
MCA_BASE_VAR_SCOPE_ALL,
|
||||||
|
&ompi_coll_tuned_scatter_blocking_send_ratio);
|
||||||
|
|
||||||
|
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||||
|
"scatter_intermediate_msg",
|
||||||
|
"use non-blocking linear algorithm for messages larger than this value",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_6,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&ompi_coll_tuned_scatter_intermediate_msg);
|
||||||
|
|
||||||
|
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
|
||||||
|
"scatter_large_msg",
|
||||||
|
"use linear algorithm for messages larger than this value",
|
||||||
|
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_6,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY,
|
||||||
|
&ompi_coll_tuned_scatter_large_msg);
|
||||||
|
|
||||||
return (MPI_SUCCESS);
|
return (MPI_SUCCESS);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -144,6 +178,11 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, int scount,
|
|||||||
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||||
rbuf, rcount, rdtype,
|
rbuf, rcount, rdtype,
|
||||||
root, comm, module);
|
root, comm, module);
|
||||||
|
case (3):
|
||||||
|
return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
|
||||||
|
rbuf, rcount, rdtype,
|
||||||
|
root, comm, module,
|
||||||
|
ompi_coll_tuned_scatter_blocking_send_ratio);
|
||||||
} /* switch */
|
} /* switch */
|
||||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||||
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
"coll:tuned:scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user