From 08618845a4de7ce7010f3ad0cb65aa7b4bf39fe1 Mon Sep 17 00:00:00 2001
From: Artem Polyakov
Date: Mon, 19 Sep 2016 10:16:50 +0300
Subject: [PATCH] ompi/mpi_init: fix barrier

Relax CPU usage pressure from the application processes when doing the
modex and barrier in ompi_mpi_init. We see significant latencies in the
SLURM/pmix plugin barrier progress because app processes aggressively
call opal_progress, pushing away the daemon process that is doing the
collective progress.
---
 ompi/runtime/ompi_mpi_init.c   | 20 +++++++++++++++++---
 ompi/runtime/ompi_mpi_params.c |  9 +++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c
index 366b5a6b30..f62caa81e4 100644
--- a/ompi/runtime/ompi_mpi_init.c
+++ b/ompi/runtime/ompi_mpi_init.c
@@ -280,6 +280,7 @@ opal_list_t ompi_registered_datareps = {{0}};
 
 bool ompi_enable_timing = false, ompi_enable_timing_ext = false;
 extern bool ompi_mpi_yield_when_idle;
+extern bool ompi_mpi_lazy_wait_in_init;
 extern int ompi_mpi_event_tick_rate;
 
 /**
@@ -532,7 +533,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     opal_pmix.register_evhandler(NULL, &info, ompi_errhandler_callback,
                                  ompi_errhandler_registration_callback,
                                  (void*)&errtrk);
-    OMPI_WAIT_FOR_COMPLETION(errtrk.active);
+    if( ompi_mpi_lazy_wait_in_init ){
+        OMPI_LAZY_WAIT_FOR_COMPLETION(errtrk.active);
+    } else {
+        OMPI_WAIT_FOR_COMPLETION(errtrk.active);
+    }
+
     OPAL_LIST_DESTRUCT(&info);
     if (OPAL_SUCCESS != errtrk.status) {
         error = "Error handler registration";
@@ -658,7 +664,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
         if (NULL != opal_pmix.fence_nb) {
             opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
                                fence_release, (void*)&active);
-            OMPI_WAIT_FOR_COMPLETION(active);
+            if( ompi_mpi_lazy_wait_in_init ){
+                OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+            } else {
+                OMPI_WAIT_FOR_COMPLETION(active);
+            }
         } else {
             opal_pmix.fence(NULL, opal_pmix_collect_all_data);
         }
@@ -835,7 +845,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
         if (NULL != opal_pmix.fence_nb) {
             opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
                                fence_release, (void*)&active);
-            OMPI_WAIT_FOR_COMPLETION(active);
+            if( ompi_mpi_lazy_wait_in_init ){
+                OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+            } else {
+                OMPI_WAIT_FOR_COMPLETION(active);
+            }
         } else {
             opal_pmix.fence(NULL, opal_pmix_collect_all_data);
         }
diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c
index ab02e20fec..54966d4015 100644
--- a/ompi/runtime/ompi_mpi_params.c
+++ b/ompi/runtime/ompi_mpi_params.c
@@ -60,6 +60,7 @@ bool ompi_have_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
 bool ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(OMPI_GROUP_SPARSE);
 
 bool ompi_mpi_yield_when_idle = true;
+bool ompi_mpi_lazy_wait_in_init = false;
 int ompi_mpi_event_tick_rate = -1;
 char *ompi_mpi_show_mca_params_string = NULL;
 bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
@@ -112,6 +113,14 @@ int ompi_mpi_register_params(void)
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &ompi_mpi_yield_when_idle);
 
+    ompi_mpi_lazy_wait_in_init = false;
+    (void) mca_base_var_register("ompi", "mpi", NULL, "lazy_wait_in_init",
+                                 "Avoid aggressive progress in MPI_Init, make sure that PMIx server has timeslots to progress",
+                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                 OPAL_INFO_LVL_9,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &ompi_mpi_lazy_wait_in_init);
+
     ompi_mpi_event_tick_rate = -1;
     (void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
                                  "How often to progress TCP communications (0 = never, otherwise specified in microseconds)",
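
Reviewer note: for anyone unfamiliar with the two wait flavours used above, here is a
minimal standalone sketch of the idea. It is not the actual OMPI_WAIT_FOR_COMPLETION /
OMPI_LAZY_WAIT_FOR_COMPLETION macros from the Open MPI tree: the progress() stub, the
volatile flag, and the 1 ms sleep interval are illustrative assumptions. The point is
only that the lazy variant yields the CPU between progress calls, so a co-located
daemon (e.g. the PMIx server in the SLURM plugin) can be scheduled to drive the
collective forward.

    /* Illustrative sketch only -- not the actual Open MPI macros.
     * progress() stands in for opal_progress(); the 1 ms sleep interval
     * is an assumption, not a value taken from this patch. */
    #define _POSIX_C_SOURCE 199309L
    #include <stdbool.h>
    #include <time.h>

    static void progress(void)
    {
        /* stand-in for opal_progress(): drive pending communication */
    }

    /* Tight wait: spins on the progress engine and monopolizes the core. */
    static void busy_wait_for_completion(volatile bool *active)
    {
        while (*active) {
            progress();
        }
    }

    /* Lazy wait: still drives progress, but sleeps briefly on every
     * iteration so other local processes get CPU time, at the cost of
     * some extra wakeup latency. */
    static void lazy_wait_for_completion(volatile bool *active)
    {
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 }; /* 1 ms */

        while (*active) {
            progress();
            nanosleep(&ts, NULL);
        }
    }

Since the new MCA parameter defaults to false, the lazy behaviour is opt-in and
existing runs are unaffected; it can be enabled at run time with, for example,
mpirun --mca mpi_lazy_wait_in_init 1 ./app, or by exporting
OMPI_MCA_mpi_lazy_wait_in_init=1 in the environment.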