From 54b40aef9154c3ff5ca1d7531c4da49bece488c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 1 Jul 2007 11:34:23 +0000 Subject: [PATCH] Schedule SEND traffic of pipeline protocol between BTLs in accordance with relative bandwidths of each BTL. Precalculate what part of a message should be sent via each BTL in advance instead of doing it during scheduling. This commit was SVN r15248. --- ompi/mca/bml/r2/bml_r2.c | 13 +++++++ ompi/mca/pml/ob1/pml_ob1.c | 12 +++++++ ompi/mca/pml/ob1/pml_ob1.h | 52 ++++++++++++++++++++++++++-- ompi/mca/pml/ob1/pml_ob1_component.c | 11 +++--- ompi/mca/pml/ob1/pml_ob1_rdma.c | 42 +++------------------- ompi/mca/pml/ob1/pml_ob1_rdma.h | 15 ++------ ompi/mca/pml/ob1/pml_ob1_recvreq.h | 2 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 51 ++++++++++++++++----------- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 5 ++- 9 files changed, 123 insertions(+), 80 deletions(-) diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 61f5db8281..97f5c8aab8 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -157,6 +157,14 @@ static int mca_bml_r2_add_btls( void ) return OMPI_SUCCESS; } +static int btl_bandwidth_compare(const void *v1, const void *v2) +{ + mca_bml_base_btl_t *b1 = (mca_bml_base_btl_t*)v1, + *b2 = (mca_bml_base_btl_t*)v2; + + return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth; +} + /* * For each proc setup a datastructure that indicates the PTLs * that can be used to reach the destination. @@ -380,6 +388,11 @@ int mca_bml_r2_add_procs( * start using the weight to compute the correct amount. 
*/ n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + + /* sort BTLs in descending order according to bandwidth value */ + qsort(bml_endpoint->btl_send.bml_btls, n_size, + sizeof(mca_bml_base_btl_t), btl_bandwidth_compare); + bml_endpoint->bml_max_send_length = 0; bml_endpoint->bml_max_rdma_length = 0; bml_endpoint->btl_rdma_index = 0; diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index fd528fadb7..8b06c99a84 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -536,3 +536,15 @@ int mca_pml_ob1_ft_event( int state ) return OMPI_SUCCESS; } + +int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2) +{ + const mca_pml_ob1_com_btl_t *b1 = v1, *b2 = v2; + + if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight) + return 1; + if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight) + return -1; + + return 0; +} diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index a9d42899ac..d10b6a3235 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -56,6 +56,7 @@ struct mca_pml_ob1_t { size_t recv_pipeline_depth; size_t rdma_put_retries_limit; int max_rdma_per_request; + int max_send_per_range; bool leave_pinned; int leave_pinned_pipeline; @@ -231,8 +232,6 @@ extern int mca_pml_ob1_ft_event( MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, des, order, \ sizeof(mca_pml_ob1_hdr_t) + (sizeof(mca_btl_base_segment_t) << 4), size) -#define MCA_PML_OB1_MAX_REGISTRATIONS 4 - struct mca_pml_ob1_pckt_pending_t { ompi_free_list_item_t super; ompi_proc_t* proc; @@ -316,4 +315,53 @@ do { \ length -= hdrlen; \ } while(0) +/* represent BTL chosen for sending request */ +struct mca_pml_ob1_com_btl_t { + mca_bml_base_btl_t *bml_btl; + struct mca_mpool_base_registration_t* btl_reg; + size_t length; +}; +typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t; + +int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2); + +/* Calculate what percentage of a message to send through each BTL according 
to + * relative weight */ +static inline void mca_pml_ob1_calc_weighted_length( + mca_pml_ob1_com_btl_t *btls, int num_btls, size_t size, + double weight_total) +{ + int i; + size_t length_left = size; + + /* shortcut for common case for only one BTL */ + if(num_btls == 1) { + btls[0].length = size; + return; + } + + /* sort BTLs according of their weights so BTLs with smaller weight will + * not hijack all of the traffic */ + qsort(btls, num_btls, sizeof(mca_pml_ob1_com_btl_t), + mca_pml_ob1_com_btl_comp); + + for(i = 0; i < num_btls; i++) { + mca_bml_base_btl_t* bml_btl = btls[i].bml_btl; + size_t length = 0; + if(length_left != 0) { + length = (length_left > bml_btl->btl_eager_limit)? + ((size_t)(size * (bml_btl->btl_weight / weight_total))) : + length_left; + + if(length > length_left) + length = length_left; + length_left -= length; + } + btls[i].length = length; + } + + /* account for rounding errors */ + btls[0].length += length_left; +} + #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index dd7023597c..b35f4ee3a2 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -115,6 +115,8 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5); mca_pml_ob1.max_rdma_per_request = mca_pml_ob1_param_register_int("max_rdma_per_request", 4); + mca_pml_ob1.max_send_per_range = + mca_pml_ob1_param_register_int("max_send_per_range", 4); mca_pml_ob1.unexpected_limit = mca_pml_ob1_param_register_int("unexpected_limit", 128); @@ -149,7 +151,7 @@ int mca_pml_ob1_component_open(void) ompi_free_list_init( &mca_pml_ob1.send_requests, sizeof(mca_pml_ob1_send_request_t) + - (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t), + (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_send_request_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, @@ -160,7 +162,7 @@ int 
mca_pml_ob1_component_open(void) ompi_free_list_init( &mca_pml_ob1.recv_requests, sizeof(mca_pml_ob1_recv_request_t) + - (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_rdma_btl_t), + (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_recv_request_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, @@ -189,8 +191,6 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1.free_list_inc, NULL); - - OBJ_CONSTRUCT(&mca_pml_ob1.pending_pckts, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.pending_pckts, @@ -206,7 +206,8 @@ int mca_pml_ob1_component_open(void) OBJ_CONSTRUCT(&mca_pml_ob1.send_ranges, ompi_free_list_t); ompi_free_list_init( &mca_pml_ob1.send_ranges, - sizeof(mca_pml_ob1_send_range_t), + sizeof(mca_pml_ob1_send_range_t) + + (mca_pml_ob1.max_send_per_range - 1) * sizeof(mca_pml_ob1_com_btl_t), OBJ_CLASS(mca_pml_ob1_send_range_t), mca_pml_ob1.free_list_num, mca_pml_ob1.free_list_max, diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c index b1a2e24d9d..4e64a8729e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.c +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c @@ -35,39 +35,6 @@ * from case when registration is not needed */ static mca_mpool_base_registration_t pml_ob1_dummy_reg; -/* Calculate what percentage of a message to send through each BTL according to - * relative weight */ -static inline void calc_weighted_length(mca_pml_ob1_rdma_btl_t *rdma_btls, - int num_btls, size_t size, double weight_total) -{ - int i; - size_t length_left = size; - - /* shortcut for common case for only one BTL */ - if(num_btls == 1) { - rdma_btls[0].length = size; - return; - } - - for(i = 0; i < num_btls; i++) { - mca_bml_base_btl_t* bml_btl = rdma_btls[i].bml_btl; - size_t length = 0; - if(length_left != 0) { - length = (length_left > bml_btl->btl_eager_limit)? 
- ((size_t)(size * (bml_btl->btl_weight / weight_total))) : - length_left; - - if(length > length_left) - length = length_left; - length_left -= length; - } - rdma_btls[i].length = length; - } - - /* account for rounding errors */ - rdma_btls[0].length += length_left; -} - /* * Check to see if memory is registered or can be registered. Build a * set of registrations on the request. @@ -77,7 +44,7 @@ size_t mca_pml_ob1_rdma_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, size_t size, - mca_pml_ob1_rdma_btl_t* rdma_btls) + mca_pml_ob1_com_btl_t* rdma_btls) { int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); double weight_total = 0; @@ -127,7 +94,8 @@ size_t mca_pml_ob1_rdma_btls( if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) return 0; - calc_weighted_length(rdma_btls, num_btls_used, size, weight_total); + mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size, + weight_total); bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls; return num_btls_used; @@ -136,7 +104,7 @@ size_t mca_pml_ob1_rdma_btls( size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint, size_t size, - mca_pml_ob1_rdma_btl_t* rdma_btls) + mca_pml_ob1_com_btl_t* rdma_btls) { int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma); double weight_total = 0; @@ -152,7 +120,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls( weight_total += rdma_btls[i].bml_btl->btl_weight; } - calc_weighted_length(rdma_btls, i, size, weight_total); + mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total); return i; } diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h index abd07e248c..3ed0655795 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdma.h +++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h @@ -24,29 +24,18 @@ struct mca_bml_base_endpoint_t; -/** - * structure to associate RDMA capable BTL(s) with corresponding registration - */ - -struct 
mca_pml_ob1_rdma_btl_t { - struct mca_bml_base_btl_t* bml_btl; - struct mca_mpool_base_registration_t* btl_reg; - size_t length; -}; -typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t; - /* * Of the set of available btls that support RDMA, * find those that already have registrations - or * register if required (for leave_pinned option) */ size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint, - unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls); + unsigned char* base, size_t size, struct mca_pml_ob1_com_btl_t* btls); /* Choose RDMA BTLs to use for sending of a request by pipeline protocol. * Calculate number of bytes to send through each BTL according to available * bandwidth */ size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint, - size_t size, mca_pml_ob1_rdma_btl_t* rdma_btls); + size_t size, mca_pml_ob1_com_btl_t* rdma_btls); #endif diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 1239c143fb..1c67d1d764 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -52,7 +52,7 @@ struct mca_pml_ob1_recv_request_t { uint32_t req_rdma_idx; bool req_pending; bool req_ack_sent; /**< whether ack was sent to the sender */ - mca_pml_ob1_rdma_btl_t req_rdma[1]; + mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t; diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 31549bc0cc..94923f1623 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -883,7 +883,10 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, { mca_pml_ob1_send_range_t *sr; ompi_free_list_item_t *i; - int rc = OMPI_SUCCESS; + mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + int rc = OMPI_SUCCESS, n; + double 
weight_total = 0; if(0 == send_length) return; @@ -894,6 +897,18 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, sr->range_send_offset = send_offset; sr->range_send_length = send_length; + sr->range_btl_idx = 0; + + for(n = 0; n < num_btls && n < mca_pml_ob1.max_send_per_range; n++) { + sr->range_btls[n].bml_btl = + mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + weight_total += sr->range_btls[n].bml_btl->btl_weight; + } + + sr->range_btl_cnt = n; + mca_pml_ob1_calc_weighted_length(sr->range_btls, n, send_length, + weight_total); + OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr); OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock); @@ -910,22 +925,19 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq, int mca_pml_ob1_send_request_schedule_exclusive( mca_pml_ob1_send_request_t* sendreq) { - mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint; - size_t num_btl_avail = - mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); do { - size_t prev_bytes_remaining = 0, num_fail = 0; + size_t prev_bytes_remaining = 0; mca_pml_ob1_send_range_t *range = NULL; + int num_fail = 0; while(true) { mca_pml_ob1_frag_hdr_t* hdr; mca_btl_base_descriptor_t* des; - int rc; + int rc, btl_idx; size_t size, offset; opal_list_item_t *item; - mca_bml_base_btl_t* bml_btl = - mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send); + mca_bml_base_btl_t* bml_btl; if(NULL == range || 0 == range->range_send_length) { OPAL_THREAD_LOCK(&sendreq->req_send_range_lock); @@ -957,7 +969,7 @@ int mca_pml_ob1_send_request_schedule_exclusive( prev_bytes_remaining = range->range_send_length; - if (num_fail == num_btl_avail) { + if (num_fail == range->range_btl_cnt) { assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE); OPAL_THREAD_LOCK(&mca_pml_ob1.lock); sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_SCHEDULE; @@ -967,17 +979,13 @@ int 
mca_pml_ob1_send_request_schedule_exclusive( return OMPI_ERR_OUT_OF_RESOURCE; } - if(num_btl_avail == 1 || - range->range_send_length < bml_btl->btl_min_send_size) { - size = range->range_send_length; - } else { - /* otherwise attempt to give the BTL a percentage of the message - * based on a weighting factor. for simplicity calculate this as - * a percentage of the overall message length (regardless of - * amount previously assigned) - */ - size = (size_t)(bml_btl->btl_weight * range->range_send_length); - } + do { + btl_idx = range->range_btl_idx; + bml_btl = range->range_btls[btl_idx].bml_btl; + size = range->range_btls[btl_idx].length; + if(++range->range_btl_idx == range->range_btl_cnt) + range->range_btl_idx = 0; + } while(!size); /* makes sure that we don't exceed BTL max send size */ if (bml_btl->btl_max_send_size != 0 && @@ -1035,8 +1043,9 @@ int mca_pml_ob1_send_request_schedule_exclusive( rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); if(rc == OMPI_SUCCESS) { - range->range_send_length -= size; /* update state */ + range->range_btls[btl_idx].length -= size; + range->range_send_length -= size; range->range_send_offset += size; OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1); } else { diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 809a4d4cba..f1394c0d47 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -58,7 +58,7 @@ struct mca_pml_ob1_send_request_t { mca_pml_ob1_send_pending_t req_pending; opal_mutex_t req_send_range_lock; opal_list_t req_send_ranges; - mca_pml_ob1_rdma_btl_t req_rdma[1]; + mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; @@ -68,6 +68,9 @@ struct mca_pml_ob1_send_range_t { ompi_free_list_item_t base; uint64_t range_send_offset; uint64_t range_send_length; + int range_btl_idx; + int range_btl_cnt; + mca_pml_ob1_com_btl_t range_btls[1]; }; typedef struct mca_pml_ob1_send_range_t 
mca_pml_ob1_send_range_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t);