
Merge pull request #1740 from rhc54/topic/async

Add an experimental ability to skip the RTE barriers at the end of MPI_Init and the beginning of MPI_Finalize
This commit is contained in:
rhc54 2016-06-01 18:31:35 -07:00
parents f33bbfd381 2c086e56be
commit 3b68c1f8db
4 changed files with 57 additions and 28 deletions
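
The new behavior is opt-in via two MCA parameters registered in the diff below. Going by the (project, framework, component, variable) tuples passed to mca_base_var_register, the full parameter names should resolve to async_mpi_init and async_mpi_finalize, so a run skipping both barriers would look roughly like this (illustrative invocation, not part of the PR itself):

    shell$ mpirun --mca async_mpi_init 1 --mca async_mpi_finalize 1 ./a.out

With the init barrier skipped, a process may leave MPI_Init before its peers are ready, which is why the new flags are labeled EXPERIMENTAL in the headers below.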

View file

@@ -246,26 +246,28 @@ int ompi_mpi_finalize(void)
        del_procs behavior around May of 2014 (see
        https://svn.open-mpi.org/trac/ompi/ticket/4669#comment:4 for
        more details). */
-    if (NULL != opal_pmix.fence_nb) {
-        active = true;
-        /* Note that use of the non-blocking PMIx fence will
-         * allow us to lazily cycle calling
-         * opal_progress(), which will allow any other pending
-         * communications/actions to complete. See
-         * https://github.com/open-mpi/ompi/issues/1576 for the
-         * original bug report. */
-        opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
-        OMPI_LAZY_WAIT_FOR_COMPLETION(active);
-    } else {
-        /* However, we cannot guarantee that the provided PMIx has
-         * fence_nb. If it doesn't, then do the best we can: an MPI
-         * barrier on COMM_WORLD (which isn't the best because of the
-         * reasons cited above), followed by a blocking PMIx fence
-         * (which does not call opal_progress()). */
-        ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
-        comm->c_coll.coll_barrier(comm, comm->c_coll.coll_barrier_module);
+    if (!ompi_async_mpi_finalize) {
+        if (NULL != opal_pmix.fence_nb) {
+            active = true;
+            /* Note that use of the non-blocking PMIx fence will
+             * allow us to lazily cycle calling
+             * opal_progress(), which will allow any other pending
+             * communications/actions to complete. See
+             * https://github.com/open-mpi/ompi/issues/1576 for the
+             * original bug report. */
+            opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
+            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
+        } else {
+            /* However, we cannot guarantee that the provided PMIx has
+             * fence_nb. If it doesn't, then do the best we can: an MPI
+             * barrier on COMM_WORLD (which isn't the best because of the
+             * reasons cited above), followed by a blocking PMIx fence
+             * (which does not call opal_progress()). */
+            ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
+            comm->c_coll.coll_barrier(comm, comm->c_coll.coll_barrier_module);
 
-        opal_pmix.fence(NULL, 0);
+            opal_pmix.fence(NULL, 0);
+        }
     }
 
     /* check for timing request - get stop time and report elapsed
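
For readers unfamiliar with the completion-flag idiom in this hunk: fence_nb() registers a callback that flips "active", and OMPI_LAZY_WAIT_FOR_COMPLETION() polls the flag while yielding. Below is a self-contained sketch of that pattern in plain C11 + pthreads; fake_fence_server stands in for the PMIx server and the usleep() poll loop stands in for the macro's opal_progress() cycling. Names and shapes are illustrative, not OMPI's.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_bool active = true;

    /* Stands in for the fence_cbfunc() callback that PMIx invokes
     * when the fence completes: it just flips the completion flag. */
    static void fence_done(void *cbdata)
    {
        atomic_store((atomic_bool *)cbdata, false);
    }

    /* Stands in for the PMIx server finishing the fence asynchronously. */
    static void *fake_fence_server(void *cbdata)
    {
        sleep(1);
        fence_done(cbdata);
        return NULL;
    }

    int main(void)
    {
        pthread_t srv;
        pthread_create(&srv, NULL, fake_fence_server, (void *)&active);

        /* The lazy wait: poll the flag while yielding, leaving room
         * to drive other pending work (opal_progress() in the real code). */
        while (atomic_load(&active)) {
            usleep(100);
        }

        pthread_join(srv, NULL);
        puts("fence complete");
        return 0;
    }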

View file

@@ -819,14 +819,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     /* wait for everyone to reach this point - this is a hard
      * barrier requirement at this time, though we hope to relax
      * it at a later point */
-    active = true;
     opal_pmix.commit();
-    if (NULL != opal_pmix.fence_nb) {
-        opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
-                           fence_release, (void*)&active);
-        OMPI_WAIT_FOR_COMPLETION(active);
-    } else {
-        opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+    if (!ompi_async_mpi_init) {
+        active = true;
+        if (NULL != opal_pmix.fence_nb) {
+            opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
+                               fence_release, (void*)&active);
+            OMPI_WAIT_FOR_COMPLETION(active);
+        } else {
+            opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+        }
     }
 
     /* check for timing request - get stop time and report elapsed
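
Note that the init path uses the non-lazy OMPI_WAIT_FOR_COMPLETION, versus the lazy variant in the finalize hunk. As a rough contrast, the two macros can be pictured like this (assumed shapes for illustration only; the real definitions live in the OMPI/OPAL headers):

    /* Assumed shapes, for contrast only: init spins hard on progress
     * (startup latency matters), finalize sleeps between progress calls. */
    #define OMPI_WAIT_FOR_COMPLETION(flag)      \
        while (flag) {                          \
            opal_progress();                    \
        }

    #define OMPI_LAZY_WAIT_FOR_COMPLETION(flag) \
        while (flag) {                          \
            opal_progress();                    \
            usleep(100);                        \
        }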

View file

@ -14,7 +14,7 @@
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2015 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
@@ -65,6 +65,9 @@ char *ompi_mpi_show_mca_params_string = NULL;
 bool ompi_mpi_have_sparse_group_storage = !!(OMPI_GROUP_SPARSE);
 bool ompi_mpi_preconnect_mpi = false;
 
+bool ompi_async_mpi_init = false;
+bool ompi_async_mpi_finalize = false;
+
 #define OMPI_ADD_PROCS_CUTOFF_DEFAULT 0
 uint32_t ompi_add_procs_cutoff = OMPI_ADD_PROCS_CUTOFF_DEFAULT;
 bool ompi_mpi_dynamics_enabled = true;
@@ -282,6 +285,22 @@ int ompi_mpi_register_params(void)
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &ompi_mpi_dynamics_enabled);
 
+    ompi_async_mpi_init = false;
+    (void) mca_base_var_register("ompi", "async", "mpi", "init",
+                                 "Do not perform a barrier at the end of MPI_Init",
+                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                 OPAL_INFO_LVL_9,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &ompi_async_mpi_init);
+
+    ompi_async_mpi_finalize = false;
+    (void) mca_base_var_register("ompi", "async", "mpi", "finalize",
+                                 "Do not perform a barrier at the beginning of MPI_Finalize",
+                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                 OPAL_INFO_LVL_9,
+                                 MCA_BASE_VAR_SCOPE_READONLY,
+                                 &ompi_async_mpi_finalize);
+
     value = mca_base_var_find ("opal", "opal", NULL, "abort_delay");
     if (0 <= value) {
         (void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_delay",
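
If needed, the registration can be sanity-checked via the variable index, mirroring the mca_base_var_find() call already in this hunk (a hypothetical check; per the framework_component_variable convention, the tuple above should yield the full name "async_mpi_init"):

    int idx = mca_base_var_find("ompi", "async", "mpi", "init");
    if (0 <= idx) {
        const mca_base_var_t *var = NULL;
        (void) mca_base_var_get(idx, &var);
        /* var->mbv_full_name is expected to read "async_mpi_init" */
    }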

View file

@@ -135,6 +135,13 @@ OMPI_DECLSPEC extern uint32_t ompi_add_procs_cutoff;
  */
 OMPI_DECLSPEC extern bool ompi_mpi_dynamics_enabled;
 
+/* EXPERIMENTAL: do not perform an RTE barrier at the end of MPI_Init */
+OMPI_DECLSPEC extern bool ompi_async_mpi_init;
+
+/* EXPERIMENTAL: do not perform an RTE barrier at the beginning of MPI_Finalize */
+OMPI_DECLSPEC extern bool ompi_async_mpi_finalize;
+
 /**
  * Register MCA parameters used by the MPI layer.
  *