1
1

COLL/TUNED: Update alltoall selection rule for mlx

Use the linear-with-sync alltoall algorithm for certain message and
communicator size ranges. This does not affect the default fixed decision
unless HPCX (with its custom parameters) is used or the corresponding MCA
parameter is set.

Signed-off-by: Mikhail Brinskii <mikhailb@mellanox.com>
(cherry picked from commit 404c4800688548b021bda68bdf10792424e6b1c5)
Этот коммит содержится в:
Mikhail Brinskii 2019-07-02 16:04:03 +03:00
родитель 86794e5b33
Коммит 3d5b7b4a1b
5 изменённых файлов: 41 добавлений и 7 удалений

Просмотреть файл

@ -78,3 +78,8 @@ opal_event_include=epoll
bml_r2_show_unreach_errors = 0
# alltoall algorithm selection settings for tuned coll mca
coll_tuned_alltoall_large_msg = 250000
coll_tuned_alltoall_min_procs = 2048
coll_tuned_alltoall_algorithm_max_requests = 8

Просмотреть файл

@ -38,6 +38,9 @@ extern int ompi_coll_tuned_init_chain_fanout;
extern int ompi_coll_tuned_init_max_requests;
extern int ompi_coll_tuned_alltoall_small_msg;
extern int ompi_coll_tuned_alltoall_intermediate_msg;
extern int ompi_coll_tuned_alltoall_large_msg;
extern int ompi_coll_tuned_alltoall_min_procs;
extern int ompi_coll_tuned_alltoall_max_reqs;
/* forced algorithm choices */
/* this structure is for storing the indexes to the forced algorithm mca params... */

Просмотреть файл

@ -28,7 +28,6 @@
/* alltoall algorithm variables */
static int coll_tuned_alltoall_forced_algorithm = 0;
static int coll_tuned_alltoall_segment_size = 0;
static int coll_tuned_alltoall_max_requests;
static int coll_tuned_alltoall_tree_fanout;
static int coll_tuned_alltoall_chain_fanout;
@ -115,7 +114,22 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
MCA_BASE_VAR_SCOPE_ALL,
&coll_tuned_alltoall_chain_fanout);
coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_large_msg",
"threshold (if supported) to decide if large MSGs alltoall algorithm will be used",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_coll_tuned_alltoall_large_msg);
(void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_min_procs",
"threshold (if supported) to decide if many processes alltoall algorithm will be used",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_coll_tuned_alltoall_min_procs);
mca_param_indices->max_requests_param_index =
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
"alltoall_algorithm_max_requests",
@ -123,17 +137,16 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_ALL,
&coll_tuned_alltoall_max_requests);
&ompi_coll_tuned_alltoall_max_reqs);
if (mca_param_indices->max_requests_param_index < 0) {
return mca_param_indices->max_requests_param_index;
}
if (coll_tuned_alltoall_max_requests < 0) {
if (ompi_coll_tuned_alltoall_max_reqs < 0) {
if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) {
opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to system level default %d \n",
ompi_coll_tuned_init_max_requests );
opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to 0 \n");
}
coll_tuned_alltoall_max_requests = 0;
ompi_coll_tuned_alltoall_max_reqs = 0;
}
return (MPI_SUCCESS);

Просмотреть файл

@ -57,6 +57,13 @@ int ompi_coll_tuned_init_max_requests = 128;
int ompi_coll_tuned_alltoall_small_msg = 200;
int ompi_coll_tuned_alltoall_intermediate_msg = 3000;
/* Defaults to the same value as the intermediate message threshold, so the
* default algorithm selection is not affected. Changing this value forces the
* linear with sync algorithm to be used for certain message sizes. */
int ompi_coll_tuned_alltoall_large_msg = 3000;
int ompi_coll_tuned_alltoall_min_procs = 0; /* not used by default */
int ompi_coll_tuned_alltoall_max_reqs = 0; /* no limit for alltoall by default */
/* forced algorithm variables */
/* indices for the MCA parameters */
coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}};

Просмотреть файл

@ -136,6 +136,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(const void *sbuf, int scount,
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module);
} else if ((block_dsize < (size_t) ompi_coll_tuned_alltoall_large_msg) &&
(communicator_size <= ompi_coll_tuned_alltoall_min_procs)) {
return ompi_coll_base_alltoall_intra_linear_sync(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module,
ompi_coll_tuned_alltoall_max_reqs);
}
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype,