Allow multiple connections to be started simultaneously when doing the OOB

wireup. For small clusters or clusters with decent ARP lookup and connect times, this will have marginal impact. For systems with either bad ARP lookup times or long connect times, increasing this number to something much closer to SOMAXCONN (128 on most modern machines) will result in a faster OOB wireup. Don't set higher than SOMAXCONN or you can end up with lots of connect() retries and we'll end up slower. This commit was SVN r14742.
2007-05-23 21:35:44 +00:00 · 2007-05-23 21:35:44 +00:00 · 5f15becf4e
--- a/ompi/runtime/ompi_mpi_params.c
+++ b/ompi/runtime/ompi_mpi_params.c
@ -203,6 +203,11 @@ int ompi_mpi_register_params(void)
                                "wire-up the OOB system between MPI processes.",
                                false, false, 0, NULL);

+    mca_base_param_reg_int_name("mpi", "preconnect_oob_simultaneous",
+                                "Number of simultaneous outstanding "
+                                "OOB connections to allow during preconnect.",
+                                false, false, 4, NULL);
+
    /* Leave pinned parameter */

    mca_base_param_reg_int_name("mpi", "leave_pinned",
--- a/ompi/runtime/ompi_mpi_preconnect.c
+++ b/ompi/runtime/ompi_mpi_preconnect.c
@ -87,9 +87,9 @@ ompi_init_preconnect_mpi(void)
 int
 ompi_init_preconnect_oob(void)
 {
-    size_t world_size, next, prev, i, world_rank;
+    size_t world_size, next, prev, i, j, world_rank;
    ompi_proc_t **procs;
-    int ret, param, value = 0;
+    int ret, simultaneous, param, value = 0;
    struct iovec inmsg[1], outmsg[1];

    param = mca_base_param_find("mpi", NULL, "preconnect_oob");
@ -104,6 +104,12 @@ ompi_init_preconnect_oob(void)
    }
    if (0 == value) return OMPI_SUCCESS;

+    param = mca_base_param_find("mpi", NULL, "preconnect_oob_simultaneous");
+    if (OMPI_ERROR == param) return OMPI_SUCCESS;
+    ret = mca_base_param_lookup_int(param, &value);
+    if (OMPI_SUCCESS != ret) return OMPI_SUCCESS;
+    simultaneous = (value < 1) ? 1 : value;
+
    procs = ompi_proc_world(&world_size);

    inmsg[0].iov_base = outmsg[0].iov_base = NULL;
@ -123,24 +129,28 @@ ompi_init_preconnect_oob(void)
       This limits any "flooding" effect that can occur with other
       connection algorithms, which can overwhelm the out-of-band
       connection system, leading to poor performance and hangs. */
-    for (i = 1 ; i <= world_size / 2 ; ++i) {
-        next = (world_rank + i) % world_size;
-        prev = (world_rank - i + world_size) % world_size;
+    for (i = 1 ; i <= world_size / 2 ; i += simultaneous) {
+        for (j = 0 ; j < (size_t) simultaneous ; ++j) {
+            next = (world_rank + (i + j )) % world_size;
                    
-        /* sends do not wait for a match */
-        ret = orte_rml.send(&procs[next]->proc_name,
-                            outmsg,
-                            1,
-                            ORTE_RML_TAG_WIREUP,
-                            0);
-        if (ret < 0) return ret;
+            /* sends do not wait for a match */
+            ret = orte_rml.send(&procs[next]->proc_name,
+                                outmsg,
+                                1,
+                                ORTE_RML_TAG_WIREUP,
+                                0);
+            if (ret < 0) return ret;
+        }
+        for (j = 0 ; j < (size_t) simultaneous ; ++j) {
+            prev = (world_rank - (i + j) + world_size) % world_size;
                    
-        ret = orte_rml.recv(&procs[prev]->proc_name,
-                            inmsg,
-                            1,
-                            ORTE_RML_TAG_WIREUP,
-                            0);
-        if (ret < 0) return ret;
+            ret = orte_rml.recv(&procs[prev]->proc_name,
+                                inmsg,
+                                1,
+                                ORTE_RML_TAG_WIREUP,
+                                0);
+            if (ret < 0) return ret;
+        }
    }
    
    return OMPI_SUCCESS;