From 404c4800688548b021bda68bdf10792424e6b1c5 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Tue, 2 Jul 2019 16:04:03 +0300 Subject: [PATCH 1/2] COLL/TUNED: Update alltoall selection rule for mlx Use the linear with sync alltoall algorithm for certain message/communicator size ranges. This does not affect the default fixed decision unless HPCX (with its custom parameters) is used or the corresponding MCA parameters are set. Signed-off-by: Mikhail Brinskii --- contrib/platform/mellanox/optimized.conf | 5 ++++ ompi/mca/coll/tuned/coll_tuned.h | 3 +++ .../coll/tuned/coll_tuned_alltoall_decision.c | 27 ++++++++++++++----- ompi/mca/coll/tuned/coll_tuned_component.c | 7 +++++ .../coll/tuned/coll_tuned_decision_fixed.c | 6 +++++ 5 files changed, 41 insertions(+), 7 deletions(-) diff --git a/contrib/platform/mellanox/optimized.conf b/contrib/platform/mellanox/optimized.conf index c58428cf6a..986db6f6b3 100644 --- a/contrib/platform/mellanox/optimized.conf +++ b/contrib/platform/mellanox/optimized.conf @@ -78,3 +78,8 @@ opal_event_include=epoll bml_r2_show_unreach_errors = 0 +# alltoall algorithm selection settings for tuned coll mca +coll_tuned_alltoall_large_msg = 250000 +coll_tuned_alltoall_min_procs = 2048 +coll_tuned_alltoall_algorithm_max_requests = 8 + diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index d4b201bc7a..7ae039c980 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -38,6 +38,9 @@ extern int ompi_coll_tuned_init_chain_fanout; extern int ompi_coll_tuned_init_max_requests; extern int ompi_coll_tuned_alltoall_small_msg; extern int ompi_coll_tuned_alltoall_intermediate_msg; +extern int ompi_coll_tuned_alltoall_large_msg; +extern int ompi_coll_tuned_alltoall_min_procs; +extern int ompi_coll_tuned_alltoall_max_reqs; /* forced algorithm choices */ /* this structure is for storing the indexes to the forced algorithm mca params... 
*/ diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index 2ef1e6b903..86c16be535 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -28,7 +28,6 @@ /* alltoall algorithm variables */ static int coll_tuned_alltoall_forced_algorithm = 0; static int coll_tuned_alltoall_segment_size = 0; -static int coll_tuned_alltoall_max_requests; static int coll_tuned_alltoall_tree_fanout; static int coll_tuned_alltoall_chain_fanout; @@ -115,7 +114,22 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm MCA_BASE_VAR_SCOPE_ALL, &coll_tuned_alltoall_chain_fanout); - coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */ + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "alltoall_large_msg", + "threshold (if supported) to decide if large MSGs alltoall algorithm will be used", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_alltoall_large_msg); + + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "alltoall_min_procs", + "threshold (if supported) to decide if many processes alltoall algorithm will be used", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_alltoall_min_procs); + mca_param_indices->max_requests_param_index = mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_algorithm_max_requests", @@ -123,17 +137,16 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &coll_tuned_alltoall_max_requests); + &ompi_coll_tuned_alltoall_max_reqs); if (mca_param_indices->max_requests_param_index < 0) { return 
mca_param_indices->max_requests_param_index; } - if (coll_tuned_alltoall_max_requests < 0) { + if (ompi_coll_tuned_alltoall_max_reqs < 0) { if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { - opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to system level default %d \n", - ompi_coll_tuned_init_max_requests ); + opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to 0 \n"); } - coll_tuned_alltoall_max_requests = 0; + ompi_coll_tuned_alltoall_max_reqs = 0; } return (MPI_SUCCESS); diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index be0d14a988..3de5aedfe2 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -57,6 +57,13 @@ int ompi_coll_tuned_init_max_requests = 128; int ompi_coll_tuned_alltoall_small_msg = 200; int ompi_coll_tuned_alltoall_intermediate_msg = 3000; +/* Set it to intermediate value by default, so it does not affect default + * algorithm selection. Changing this value will force using linear with sync + * algorithm on certain message sizes. 
*/ +int ompi_coll_tuned_alltoall_large_msg = 3000; +int ompi_coll_tuned_alltoall_min_procs = 0; /* not used by default */ +int ompi_coll_tuned_alltoall_max_reqs = 0; /* no limit for alltoall by default */ + /* forced alogrithm variables */ /* indices for the MCA parameters */ coll_tuned_force_algorithm_mca_param_indices_t ompi_coll_tuned_forced_params[COLLCOUNT] = {{0}}; diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 0150fcc3b4..2518afee98 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -136,6 +136,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(const void *sbuf, int scount, return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); + } else if ((block_dsize < (size_t) ompi_coll_tuned_alltoall_large_msg) && + (communicator_size <= ompi_coll_tuned_alltoall_min_procs)) { + return ompi_coll_base_alltoall_intra_linear_sync(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, module, + ompi_coll_tuned_alltoall_max_reqs); } return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, From 65618f8db848613c95cbe112033df94721d326a8 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Wed, 24 Jul 2019 10:23:38 +0000 Subject: [PATCH 2/2] COLL/TUNED: Minor var names/comments fixes Signed-off-by: Mikhail Brinskii --- ompi/mca/coll/tuned/coll_tuned.h | 2 +- ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c | 10 +++++----- ompi/mca/coll/tuned/coll_tuned_component.c | 10 +++++----- ompi/mca/coll/tuned/coll_tuned_decision_fixed.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 7ae039c980..d6fc4b89bd 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -40,7 +40,7 @@ extern int ompi_coll_tuned_alltoall_small_msg; extern int 
ompi_coll_tuned_alltoall_intermediate_msg; extern int ompi_coll_tuned_alltoall_large_msg; extern int ompi_coll_tuned_alltoall_min_procs; -extern int ompi_coll_tuned_alltoall_max_reqs; +extern int ompi_coll_tuned_alltoall_max_requests; /* forced algorithm choices */ /* this structure is for storing the indexes to the forced algorithm mca params... */ diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index 86c16be535..b63037e123 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -116,7 +116,7 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_large_msg", - "threshold (if supported) to decide if large MSGs alltoall algorithm will be used", + "use pairwise exchange algorithm for messages larger than this value", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -124,7 +124,7 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, "alltoall_min_procs", - "threshold (if supported) to decide if many processes alltoall algorithm will be used", + "use pairwise exchange algorithm for communicators larger than this value", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -137,16 +137,16 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, - &ompi_coll_tuned_alltoall_max_reqs); + &ompi_coll_tuned_alltoall_max_requests); if (mca_param_indices->max_requests_param_index < 0) { return mca_param_indices->max_requests_param_index; } - if (ompi_coll_tuned_alltoall_max_reqs < 0) { + if 
(ompi_coll_tuned_alltoall_max_requests < 0) { if( 0 == ompi_comm_rank( MPI_COMM_WORLD ) ) { opal_output( 0, "Maximum outstanding requests must be positive number greater than 1. Switching to 0 \n"); } - ompi_coll_tuned_alltoall_max_reqs = 0; + ompi_coll_tuned_alltoall_max_requests = 0; } return (MPI_SUCCESS); diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index 3de5aedfe2..25e9bc77a0 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -57,12 +57,12 @@ int ompi_coll_tuned_init_max_requests = 128; int ompi_coll_tuned_alltoall_small_msg = 200; int ompi_coll_tuned_alltoall_intermediate_msg = 3000; -/* Set it to intermediate value by default, so it does not affect default - * algorithm selection. Changing this value will force using linear with sync - * algorithm on certain message sizes. */ +/* Set it to the same value as intermediate msg by default, so it does not affect + * default algorithm selection. Changing this value will force using linear with + * sync algorithm on certain message sizes. 
*/ int ompi_coll_tuned_alltoall_large_msg = 3000; -int ompi_coll_tuned_alltoall_min_procs = 0; /* not used by default */ -int ompi_coll_tuned_alltoall_max_reqs = 0; /* no limit for alltoall by default */ +int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */ +int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */ /* forced alogrithm variables */ /* indices for the MCA parameters */ diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 2518afee98..97560c5c08 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -141,7 +141,7 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(const void *sbuf, int scount, return ompi_coll_base_alltoall_intra_linear_sync(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module, - ompi_coll_tuned_alltoall_max_reqs); + ompi_coll_tuned_alltoall_max_requests); } return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype,