Clean up the preconnect code:
* Don't need the two-process special case -- we'll send an extra message, but at very little cost, and less code is better.
* Use COMPLETE sends instead of STANDARD sends so that the connection is fully established before we move on to the next connection. The previous code was still causing minor connection flooding for huge numbers of processes (a sketch of the resulting ring pattern follows below).
* mpi_preconnect_all now connects both the OOB and MPI layers. There are also mpi_preconnect_mpi and mpi_preconnect_oob should you want to be more specific.
* Since we're only using the MCA parameters once at the beginning of time, there's no need for global constants. Just do the quick param lookup right before the parameter is needed. Save some of that global variable space for the next guy.

Fixes trac:963

This commit was SVN r14553.

The following Trac tickets were found above:
  Ticket 963 --> https://svn.open-mpi.org/trac/ompi/ticket/963
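As an illustration of the connection pattern described above (not the Open MPI source itself -- the real code in the diff below uses the internal PML and RML calls), here is a minimal, hypothetical sketch of the same ring-style warmup expressed with the public MPI API. Each iteration posts exactly one send and one receive per process and waits for both to complete before moving on, which is what keeps the wire-up from flooding the out-of-band system:

    /* Hypothetical standalone sketch, not part of this commit. */
    #include <mpi.h>

    static void warmup_ring(MPI_Comm comm)
    {
        int rank, size, i;
        char inbuf = 0, outbuf = 0;
        MPI_Request reqs[2];

        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);

        for (i = 1; i <= size / 2; ++i) {
            int next = (rank + i) % size;        /* neighbor i hops to the right */
            int prev = (rank - i + size) % size; /* neighbor i hops to the left  */

            MPI_Isend(&outbuf, 1, MPI_CHAR, next, 1, comm, &reqs[0]);
            MPI_Irecv(&inbuf, 1, MPI_CHAR, prev, 1, comm, &reqs[1]);
            /* complete both before starting the next pair, so at most one
               connection per direction is being established at a time */
            MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
        }
    }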
This commit is contained in:
parent e63346a633
commit a25ce44dc1
@@ -105,8 +105,8 @@ extern "C" {
  * Do a preconnect of MPI connections (i.e., force connections to
  * be made if they will be made).
  */
-int ompi_init_do_preconnect(void);
-int ompi_init_do_oob_preconnect(void);
+int ompi_init_preconnect_oob(void);
+int ompi_init_preconnect_mpi(void);
 
 #if defined(c_plusplus) || defined(__cplusplus)
 }
@@ -681,11 +681,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     /* wire up the oob interface, if requested. Do this here because
        it will go much faster before the event library is switched
        into non-blocking mode */
-    if (ompi_mpi_preconnect_oob) {
-        if (OMPI_SUCCESS != (ret = ompi_init_do_oob_preconnect())) {
-            error = "ompi_mpi_do_preconnect_oob() failed";
-            goto error;
-        }
+    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_oob())) {
+        error = "ompi_mpi_do_preconnect_oob() failed";
+        goto error;
     }
 
     /* check for timing request - get stop time and report elapsed
@@ -708,6 +706,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
        latency. */
     opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
 #endif
 
+    /* wire up the mpi interface, if requested. Do this after the
+       non-block switch for non-TCP performance. Do before the
+       polling change as anyone with a complex wire-up is going to be
+       using the oob. */
+    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
+        error = "ompi_mpi_do_preconnect_all() failed";
+        goto error;
+    }
+
     /* Check whether we have been spawned or not. We introduce that
        at the very end, since we need collectives, datatypes, ptls
@@ -777,17 +784,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     if (value >= 0) {
         opal_progress_set_event_poll_rate(value);
     }
 
     /* At this point, we are fully configured and in MPI mode. Any
        communication calls here will work exactly like they would in
        the user's code. Setup the connections between procs and warm
        them up with simple sends, if requested */
-    if (ompi_mpi_preconnect_all) {
-        if (OMPI_SUCCESS != (ret = ompi_init_do_preconnect())) {
-            error = "ompi_mpi_do_preconnect_all() failed";
-            goto error;
-        }
-    }
 
  error:
     if (ret != OMPI_SUCCESS) {
@@ -51,8 +51,6 @@ bool ompi_mpi_paffinity_alone = false;
 bool ompi_mpi_abort_print_stack = false;
 int ompi_mpi_abort_delay = 0;
 bool ompi_mpi_keep_peer_hostnames = true;
-bool ompi_mpi_preconnect_all = false;
-bool ompi_mpi_preconnect_oob = false;
 bool ompi_mpi_leave_pinned = false;
 bool ompi_mpi_leave_pinned_pipeline = false;
 
@@ -187,18 +185,23 @@ int ompi_mpi_register_params(void)
 #endif
 
     mca_base_param_reg_int_name("mpi", "preconnect_all",
-                                "Whether to force MPI processes to create connections / warmup with *all* peers during MPI_INIT (vs. making connections lazily -- upon the first MPI traffic between each process peer pair)",
-                                false, false,
-                                (int) ompi_mpi_preconnect_all, &value);
-    ompi_mpi_preconnect_all = OPAL_INT_TO_BOOL(value);
+                                "Whether to force MPI processes to create OOB "
+                                "and MPI connections with *all* peers during "
+                                "MPI_INIT (vs. making connections lazily -- "
+                                "upon the first MPI traffic between each "
+                                "process peer pair)",
+                                false, false, 0, NULL);
+
+    mca_base_param_reg_int_name("mpi", "preconnect_mpi",
+                                "Whether to force MPI processes to fully "
+                                "wire-up the MPI connections between MPI "
+                                "processes.",
+                                false, false, 0, NULL);
 
     mca_base_param_reg_int_name("mpi", "preconnect_oob",
-                                "Whether to force MPI processes to fully wire-up the OOB system between MPI processes.",
-                                false, false,
-                                (int) ompi_mpi_preconnect_oob, &value);
-    ompi_mpi_preconnect_oob = OPAL_INT_TO_BOOL(value);
+                                "Whether to force MPI processes to fully "
+                                "wire-up the OOB system between MPI processes.",
+                                false, false, 0, NULL);
 
     /* Leave pinned parameter */
@@ -18,6 +18,7 @@
 
 #include <stdlib.h>
 
+#include "ompi/constants.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/request/request.h"
@@ -25,126 +26,121 @@
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/rml/rml_types.h"
 
-/*
- * do zero byte IRECV / ISEND: upper half sends to lower half (i.e. do
- * a ping, not a ping pong)
- */
-int ompi_init_do_preconnect(void)
+int
+ompi_init_preconnect_mpi(void)
 {
     int comm_size = ompi_comm_size(MPI_COMM_WORLD);
-    int my_rank = ompi_comm_rank(MPI_COMM_WORLD);
-    int i, ret;
-    int next,prev;
+    int comm_rank = ompi_comm_rank(MPI_COMM_WORLD);
+    int param, value, next, prev, i, ret = OMPI_SUCCESS;
     struct ompi_request_t * requests[2];
-    ret = OMPI_SUCCESS;
-    if(comm_size == 2) {
-        if(my_rank){
-            ret = MCA_PML_CALL(send(MPI_BOTTOM, 0, MPI_BYTE,
-                                    0, 1,
-                                    MCA_PML_BASE_SEND_STANDARD,
-                                    MPI_COMM_WORLD));
-            if(OMPI_SUCCESS != ret) {
-                return ret;
-            }
-        } else {
-            ret = MCA_PML_CALL(recv(MPI_BOTTOM,0, MPI_BYTE, 1,
-                                    1, MPI_COMM_WORLD,
-                                    MPI_STATUS_IGNORE));
-            if(OMPI_SUCCESS != ret) {
-                return ret;
-            }
-        }
-    } else {
-        for (i = 1; i <= comm_size/2; ++i) {
-            next = (my_rank + i) % comm_size;
-            prev = (my_rank - i + comm_size) % comm_size;
-            ret = MCA_PML_CALL(irecv(MPI_BOTTOM,0, MPI_BYTE,
-                                     prev, 1,
-                                     MPI_COMM_WORLD,
-                                     &requests[0]));
-            if(OMPI_SUCCESS != ret) {
-                return ret;
-            }
-            ret = MCA_PML_CALL(isend(MPI_BOTTOM, 0, MPI_BYTE,
-                                     next, 1,
-                                     MCA_PML_BASE_SEND_STANDARD,
-                                     MPI_COMM_WORLD,
-                                     &requests[1]));
-            if (OMPI_SUCCESS != ret) {
-                return ret;
-            }
-            ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
-            if (OMPI_SUCCESS != ret) {
-                return ret;
-            }
-
-        }
-    }
+    char inbuf[1], outbuf[1];
+
+    param = mca_base_param_find("mpi", NULL, "preconnect_mpi");
+    if (OMPI_ERROR == param) return OMPI_SUCCESS;
+    ret = mca_base_param_lookup_int(param, &value);
+    if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
+    if (0 == value) {
+        param = mca_base_param_find("mpi", NULL, "preconnect_all");
+        if (OMPI_ERROR == param) return OMPI_SUCCESS;
+        ret = mca_base_param_lookup_int(param, &value);
+        if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
+    }
+    if (0 == value) return OMPI_SUCCESS;
+
+    inbuf[0] = outbuf[0] = '\0';
+
+    /* Each iteration, every process sends to its neighbor i hops to
+       the right and receives from its neighbor i hops to the left.
+       Because send_complete is used, there will only ever be one
+       outstanding send and one outstanding receive in the network at
+       a time for any given process.  This limits any "flooding"
+       effect that can occur with other connection algorithms.  While
+       the flooding algorithms may be a more efficient use of
+       resources, they can overwhelm the out-of-band connection system
+       used to wire up some networks, leading to poor performance and
+       hangs. */
+    for (i = 1 ; i <= comm_size / 2 ; ++i) {
+        next = (comm_rank + i) % comm_size;
+        prev = (comm_rank - i + comm_size) % comm_size;
+
+        ret = MCA_PML_CALL(isend(outbuf, 1, MPI_CHAR,
+                                 next, 1,
+                                 MCA_PML_BASE_SEND_COMPLETE,
+                                 MPI_COMM_WORLD,
+                                 &requests[1]));
+        if (OMPI_SUCCESS != ret) return ret;
+
+        ret = MCA_PML_CALL(irecv(inbuf, 1, MPI_CHAR,
+                                 prev, 1,
+                                 MPI_COMM_WORLD,
+                                 &requests[0]));
+        if(OMPI_SUCCESS != ret) return ret;
+
+        ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
+        if (OMPI_SUCCESS != ret) return ret;
+    }
 
     return ret;
 }
 
 
-int ompi_init_do_oob_preconnect(void)
+int
+ompi_init_preconnect_oob(void)
 {
-    size_t world_size, next, prev, i, my_index;
+    size_t world_size, next, prev, i, world_rank;
     ompi_proc_t **procs;
-    int ret;
-    struct iovec msg[1];
+    int ret, param, value = 0;
+    struct iovec inmsg[1], outmsg[1];
+
+    param = mca_base_param_find("mpi", NULL, "preconnect_oob");
+    if (OMPI_ERROR == param) return OMPI_SUCCESS;
+    ret = mca_base_param_lookup_int(param, &value);
+    if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
+    if (0 == value) {
+        param = mca_base_param_find("mpi", NULL, "preconnect_all");
+        if (OMPI_ERROR == param) return OMPI_SUCCESS;
+        ret = mca_base_param_lookup_int(param, &value);
+        if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
+    }
+    if (0 == value) return OMPI_SUCCESS;
 
     procs = ompi_proc_world(&world_size);
 
-    msg[0].iov_base = NULL;
-    msg[0].iov_len = 0;
+    inmsg[0].iov_base = outmsg[0].iov_base = NULL;
+    inmsg[0].iov_len = outmsg[0].iov_len = 0;
 
-    if (world_size == 2) {
-        if (ompi_proc_local() == procs[0]) {
-            ret = orte_rml.send(&procs[1]->proc_name,
-                                msg,
-                                1,
-                                ORTE_RML_TAG_WIREUP,
-                                0);
-            if (ret < 0) return ret;
-        } else {
-            ret = orte_rml.recv(&procs[0]->proc_name,
-                                msg,
-                                1,
-                                ORTE_RML_TAG_WIREUP,
-                                0);
-            if (ret < 0) return ret;
-        }
-    } else if (world_size > 2) {
-        /* find local index in proc list (which *should* be our rank
-           in MCW, aborting if not found */
-        for (i = 0 ; i < world_size ; ++i) {
-            if (ompi_proc_local() == procs[i]) {
-                my_index = i;
-                goto preconnect_stage;
-            }
-        }
-        /* I'm not on the list ? well then just complain */
-        return OMPI_ERR_NOT_FOUND;
-
-    preconnect_stage:
-        for (i = 1 ; i <= world_size / 2 ; ++i) {
-            next = (my_index + i) % world_size;
-            prev = (my_index - i + world_size) % world_size;
-
-            /* sends do not wait for a match */
-            ret = orte_rml.send(&procs[next]->proc_name,
-                                msg,
-                                1,
-                                ORTE_RML_TAG_WIREUP,
-                                0);
-            if (ret < 0) return ret;
-
-            ret = orte_rml.recv(&procs[prev]->proc_name,
-                                msg,
-                                1,
-                                ORTE_RML_TAG_WIREUP,
-                                0);
-            if (ret < 0) return ret;
-        }
-    }
+    /* proc_world and ompi_comm_world should have the same proc list... */
+    if ((int) world_size != ompi_comm_size(MPI_COMM_WORLD)) {
+        return OMPI_ERR_NOT_FOUND;
+    } else if (ompi_proc_local() !=
+               procs[ompi_comm_rank(MPI_COMM_WORLD)]) {
+        return OMPI_ERR_NOT_FOUND;
+    }
+    world_rank = (size_t) ompi_comm_rank(MPI_COMM_WORLD);
+
+    /* Each iteration, every process sends to its neighbor i hops to
+       the right and receives from its neighbor i hops to the left.
+       This limits any "flooding" effect that can occur with other
+       connection algorithms, which can overwhelm the out-of-band
+       connection system, leading to poor performance and hangs. */
+    for (i = 1 ; i <= world_size / 2 ; ++i) {
+        next = (world_rank + i) % world_size;
+        prev = (world_rank - i + world_size) % world_size;
+
+        /* sends do not wait for a match */
+        ret = orte_rml.send(&procs[next]->proc_name,
+                            outmsg,
+                            1,
+                            ORTE_RML_TAG_WIREUP,
+                            0);
+        if (ret < 0) return ret;
+
+        ret = orte_rml.recv(&procs[prev]->proc_name,
+                            inmsg,
+                            1,
+                            ORTE_RML_TAG_WIREUP,
+                            0);
+        if (ret < 0) return ret;
+    }
 
     return OMPI_SUCCESS;
 }
@@ -21,9 +21,8 @@
 
 #ifndef OMPI_RUNTIME_PARAMS_H
 #define OMPI_RUNTIME_PARAMS_H
-#if defined(c_plusplus) || defined(__cplusplus)
-extern "C" {
-#endif
+
+BEGIN_C_DECLS
 
 /*
  * Global variables
@@ -91,71 +90,58 @@ OMPI_DECLSPEC extern char * ompi_mpi_show_mca_params_file;
  */
 OMPI_DECLSPEC extern bool ompi_mpi_paffinity_alone;
 
 /**
  * Whether we should keep the string hostnames of all the MPI
  * process peers around or not (eats up a good bit of memory).
  */
 OMPI_DECLSPEC extern bool ompi_mpi_keep_peer_hostnames;
 
 /**
  * Whether an MPI_ABORT should print out a stack trace or not.
  */
 OMPI_DECLSPEC extern bool ompi_mpi_abort_print_stack;
 
-/**
- * Whether we should force all connections to be created during
- * MPI_INIT (vs. potentially making all the connection lazily upon
- * first communication with an MPI peer process).
- */
-OMPI_DECLSPEC extern bool ompi_mpi_preconnect_all;
-
-/**
- * should we wireup the oob completely during MPI_INIT?
- */
-OMPI_DECLSPEC extern bool ompi_mpi_preconnect_oob;
-
 /**
  * Whether MPI_ABORT should print out an identifying message
  * (e.g., hostname and PID) and loop waiting for a debugger to
  * attach. The value of the integer is how many seconds to wait:
  *
  * 0 = do not print the message and do not loop
  * negative value = print the message and loop forever
  * positive value = print the message and delay for that many seconds
  */
 OMPI_DECLSPEC extern int ompi_mpi_abort_delay;
 
 /**
  * Whether to use the "leave pinned" protocol or not.
  */
 OMPI_DECLSPEC extern bool ompi_mpi_leave_pinned;
 
 /**
  * Whether to use the "leave pinned pipeline" protocol or not.
  */
 OMPI_DECLSPEC extern bool ompi_mpi_leave_pinned_pipeline;
 
 /**
  * Register MCA parameters used by the MPI layer.
  *
  * @returns OMPI_SUCCESS
  *
  * Registers several MCA parameters and initializes corresponding
  * global variables to the values obtained from the MCA system.
  */
 OMPI_DECLSPEC int ompi_mpi_register_params(void);
 
 /**
  * Display all MCA parameters used
  *
  * @returns OMPI_SUCCESS
  *
  * Displays in key = value format
  */
 int ompi_show_all_mca_params(int32_t, int, char *);
-#if defined(c_plusplus) || defined(__cplusplus)
-}
-#endif
+
+END_C_DECLS
 
 #endif /* OMPI_RUNTIME_PARAMS_H */