From 4db3730a25993ec79c2df1f52f9cb5df13bdcff2 Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Thu, 3 Dec 2015 10:46:11 -0500
Subject: [PATCH 1/5] Be consistent for atomic operations and add a delta of
 the same type.

Signed-off-by: George Bosilca
---
 opal/include/opal/sys/atomic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/opal/include/opal/sys/atomic.h b/opal/include/opal/sys/atomic.h
index 9aa0b40247..f0ed2e4806 100644
--- a/opal/include/opal/sys/atomic.h
+++ b/opal/include/opal/sys/atomic.h
@@ -447,7 +447,7 @@ int64_t opal_atomic_sub_64(volatile int64_t *addr, int64_t delta);
  */
 #if defined(DOXYGEN) || OPAL_ENABLE_DEBUG
 static inline size_t
-opal_atomic_add_size_t(volatile size_t *addr, int delta)
+opal_atomic_add_size_t(volatile size_t *addr, size_t delta)
 {
 #if SIZEOF_SIZE_T == 4
     return (size_t) opal_atomic_add_32((int32_t*) addr, delta);
@@ -458,7 +458,7 @@ opal_atomic_add_size_t(volatile size_t *addr, int delta)
 #endif
 }
 static inline size_t
-opal_atomic_sub_size_t(volatile size_t *addr, int delta)
+opal_atomic_sub_size_t(volatile size_t *addr, size_t delta)
 {
 #if SIZEOF_SIZE_T == 4
     return (size_t) opal_atomic_sub_32((int32_t*) addr, delta);
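Why the prototype matters: with an int delta the narrowing happens silently at the call site, so on 64-bit platforms a large size_t argument can be truncated before the atomic update ever sees it. Below is a self-contained sketch of that failure mode; the add_with_* helpers are hypothetical stand-ins for the wrappers above (not OPAL functions), and a 64-bit size_t is assumed.

/* Illustration only -- not part of the patch. */
#include <stdio.h>
#include <stddef.h>

static size_t add_with_int_delta(size_t *addr, int delta)
{
    return *addr += (size_t) delta;  /* delta was already narrowed by the caller */
}

static size_t add_with_size_t_delta(size_t *addr, size_t delta)
{
    return *addr += delta;           /* full-width update */
}

int main(void)
{
    size_t a = 0, b = 0;
    size_t big = (size_t) 1 << 32;   /* 4GB: the low 32 bits are all zero */

    add_with_int_delta(&a, (int) big);   /* (int) big is 0 on typical LP64: adds nothing */
    add_with_size_t_delta(&b, big);      /* adds the full 4GB */
    printf("int delta: %zu, size_t delta: %zu\n", a, b);
    return 0;
}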
From c340da25864069c8b179abde44b8242a92ab2515 Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Thu, 3 Dec 2015 10:47:00 -0500
Subject: [PATCH 2/5] A first cut at the large data problem with TCP.

As long as writev and readv support a sum larger than a uint32_t,
this version will work. For the other OSes a different patch is
required. This patch is a slight modification of the one proposed
by @ggouaillardet.

Signed-off-by: George Bosilca
---
 opal/mca/btl/tcp/btl_tcp_frag.c | 17 ++++++++---------
 opal/mca/btl/tcp/btl_tcp_frag.h |  4 ++--
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c
index 75901e9c47..a0ff9ebb78 100644
--- a/opal/mca/btl/tcp/btl_tcp_frag.c
+++ b/opal/mca/btl/tcp/btl_tcp_frag.c
@@ -112,11 +112,11 @@ size_t mca_btl_tcp_frag_dump(mca_btl_tcp_frag_t* frag, char* msg, char* buf, siz
 
 bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
 {
-    ssize_t cnt = -1;
+    ssize_t cnt;
     size_t i, num_vecs;
 
     /* non-blocking write, but continue if interrupted */
-    while(cnt < 0) {
+    do {
         cnt = writev(sd, frag->iov_ptr, frag->iov_cnt);
         if(cnt < 0) {
             switch(opal_socket_errno) {
@@ -140,11 +140,11 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
                 return false;
             }
         }
-    }
+    } while(cnt < 0);
 
     /* if the write didn't complete - update the iovec state */
     num_vecs = frag->iov_cnt;
-    for(i=0; i<num_vecs; i++) {
+    for( i = 0; i < num_vecs; i++ ) {
         if(cnt >= (ssize_t)frag->iov_ptr->iov_len) {
             cnt -= frag->iov_ptr->iov_len;
             frag->iov_ptr++;
@@ -166,14 +166,14 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
 bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
 {
     mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
-    int i, num_vecs, dont_copy_data = 0;
     ssize_t cnt;
+    int32_t i, num_vecs, dont_copy_data = 0;
 
 repeat:
     num_vecs = frag->iov_cnt;
 #if MCA_BTL_TCP_ENDPOINT_CACHE
     if( 0 != btl_endpoint->endpoint_cache_length ) {
-        size_t length;
+        ssize_t length;
         /* It's strange at the first look but cnt have to be set to the full amount of data
          * available. After going to advance_iov_position we will use cnt to detect if there
          * is still some data pending.
@@ -208,8 +208,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
 #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */
 
     /* non-blocking read, but continue if interrupted */
-    cnt = -1;
-    while( cnt < 0 ) {
+    do {
         cnt = readv(sd, frag->iov_ptr, num_vecs);
         if( 0 < cnt ) goto advance_iov_position;
         if( cnt == 0 ) {
@@ -247,7 +246,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
             mca_btl_tcp_endpoint_close(btl_endpoint);
             return false;
         }
-    }
+    } while( cnt < 0 );
 
 advance_iov_position:
     /* if the read didn't complete - update the iovec state */
diff --git a/opal/mca/btl/tcp/btl_tcp_frag.h b/opal/mca/btl/tcp/btl_tcp_frag.h
index b73da8f6ed..e1b068502f 100644
--- a/opal/mca/btl/tcp/btl_tcp_frag.h
+++ b/opal/mca/btl/tcp/btl_tcp_frag.h
@@ -53,8 +53,8 @@ struct mca_btl_tcp_frag_t {
     mca_btl_tcp_hdr_t hdr;
     struct iovec iov[MCA_BTL_TCP_FRAG_IOVEC_NUMBER + 1];
     struct iovec *iov_ptr;
-    size_t iov_cnt;
-    size_t iov_idx;
+    uint32_t iov_cnt;
+    uint32_t iov_idx;
     size_t size;
     uint16_t next_step;
     int rc;
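The do/while rewrite above is the classic restart-on-EINTR idiom, and the loop after it advances the iovec array by however many bytes the kernel accepted. A minimal sketch of the combined pattern follows; flush_iovecs is a hypothetical helper distilled from mca_btl_tcp_frag_send above (the real function also classifies the socket errors). The key detail is that the count stays an ssize_t, so a single writev can report a total larger than 32 bits on OSes that allow it.

/* Illustration only -- not part of the patch. Returns true when every
 * iovec has been fully written. */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>

static bool flush_iovecs(int sd, struct iovec **iov_ptr, uint32_t *iov_cnt)
{
    ssize_t cnt;

    /* non-blocking write, but restart if interrupted */
    do {
        cnt = writev(sd, *iov_ptr, *iov_cnt);
    } while (cnt < 0 && EINTR == errno);

    if (cnt < 0) {
        return false;  /* EAGAIN and friends: let the event loop retry later */
    }
    /* consume whole iovec entries first... */
    while (*iov_cnt > 0 && cnt >= (ssize_t) (*iov_ptr)->iov_len) {
        cnt -= (*iov_ptr)->iov_len;
        (*iov_ptr)++;
        (*iov_cnt)--;
    }
    /* ...then trim the partially written entry */
    if (*iov_cnt > 0) {
        (*iov_ptr)->iov_base = (char *) (*iov_ptr)->iov_base + cnt;
        (*iov_ptr)->iov_len -= cnt;
    }
    return 0 == *iov_cnt;
}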
From 050bd3b6d73fe66c12de84696a9849a94dfd374c Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Thu, 3 Dec 2015 10:48:48 -0500
Subject: [PATCH 3/5] Make the pipeline depth an int instead of a size_t.

While they are supposed to be unsigned, casting them to a signed value
for all atomic operations is as error-prone as handling them as signed
entities.

Signed-off-by: George Bosilca
---
 ompi/mca/pml/ob1/pml_ob1.h           |  4 ++--
 ompi/mca/pml/ob1/pml_ob1_component.c |  4 ++--
 ompi/mca/pml/ob1/pml_ob1_recvreq.c   |  8 ++++----
 ompi/mca/pml/ob1/pml_ob1_recvreq.h   | 12 ++++++------
 ompi/mca/pml/ob1/pml_ob1_sendreq.c   | 10 +++++-----
 ompi/mca/pml/ob1/pml_ob1_sendreq.h   | 10 +++++-----
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h
index 10162916c6..4826587564 100644
--- a/ompi/mca/pml/ob1/pml_ob1.h
+++ b/ompi/mca/pml/ob1/pml_ob1.h
@@ -55,8 +55,8 @@ struct mca_pml_ob1_t {
     int free_list_num;      /* initial size of free list */
     int free_list_max;      /* maximum size of free list */
     int free_list_inc;      /* number of elements to grow free list */
-    size_t send_pipeline_depth;
-    size_t recv_pipeline_depth;
+    int32_t send_pipeline_depth;
+    int32_t recv_pipeline_depth;
     size_t rdma_retries_limit;
     int max_rdma_per_request;
     int max_send_per_range;
diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c
index 6557bc2037..1bb14e179e 100644
--- a/ompi/mca/pml/ob1/pml_ob1_component.c
+++ b/ompi/mca/pml/ob1/pml_ob1_component.c
@@ -184,8 +184,8 @@ static int mca_pml_ob1_component_register(void)
     mca_pml_ob1_param_register_int("free_list_max", -1, &mca_pml_ob1.free_list_max);
     mca_pml_ob1_param_register_int("free_list_inc", 64, &mca_pml_ob1.free_list_inc);
     mca_pml_ob1_param_register_int("priority", 20, &mca_pml_ob1.priority);
-    mca_pml_ob1_param_register_sizet("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth);
-    mca_pml_ob1_param_register_sizet("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth);
+    mca_pml_ob1_param_register_int("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth);
+    mca_pml_ob1_param_register_int("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth);
 
     /* NTH: we can get into a live-lock situation in the RDMA failure path so disable
      * RDMA retries for now. Falling back to send may suck but it is better than
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 5b45410874..3cd4c0d8fb 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -190,7 +190,7 @@ static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t r
     mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
 
-    OPAL_THREAD_SUB_SIZE_T(&recvreq->req_pipeline_depth, 1);
+    OPAL_THREAD_ADD32(&recvreq->req_pipeline_depth, -1);
 
     MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
 
@@ -198,7 +198,7 @@ static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t r
     assert ((uint64_t) rdma_size == frag->rdma_length);
 
     /* check completion status */
-    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
+    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, rdma_size);
     if (recv_request_pml_complete_check(recvreq) == false &&
         recvreq->req_rdma_offset < recvreq->req_send_offset) {
         /* schedule additional rdma operations */
@@ -951,7 +951,7 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
     }
 
     while(bytes_remaining > 0 &&
-          recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
+            recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
         mca_pml_ob1_rdma_frag_t *frag = NULL;
         mca_btl_base_module_t *btl;
         int rc, rdma_idx;
@@ -1028,7 +1028,7 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
         if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
             /* update request state */
             recvreq->req_rdma_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&recvreq->req_pipeline_depth, 1);
             recvreq->req_rdma[rdma_idx].length -= size;
             bytes_remaining -= size;
         } else {
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
index 6d57569323..82c4767d83 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -41,12 +41,12 @@ BEGIN_C_DECLS
 struct mca_pml_ob1_recv_request_t {
     mca_pml_base_recv_request_t req_recv;
     opal_ptr_t remote_req_send;
-    int32_t req_lock;
-    size_t req_pipeline_depth;
-    size_t req_bytes_received;   /**< amount of data transferred into the user buffer */
-    size_t req_bytes_expected;   /**< local size of the data as suggested by the user */
-    size_t req_rdma_offset;
-    size_t req_send_offset;
+    int32_t req_lock;
+    int32_t req_pipeline_depth;
+    size_t  req_bytes_received;  /**< amount of data transferred into the user buffer */
+    size_t  req_bytes_expected;  /**< local size of the data as suggested by the user */
+    size_t  req_rdma_offset;
+    size_t  req_send_offset;
     uint32_t req_rdma_cnt;
     uint32_t req_rdma_idx;
     bool req_pending;
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
index f0a227f5dc..f358d733da 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -313,7 +313,7 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
                              des->des_segment_count,
                              sizeof(mca_pml_ob1_frag_hdr_t));
 
-    OPAL_THREAD_SUB_SIZE_T(&sendreq->req_pipeline_depth, 1);
+    OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, -1);
     OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
 
     if(send_request_pml_complete_check(sendreq) == false) {
@@ -913,13 +913,13 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
 
     /* check pipeline_depth here before attempting to get any locks */
     if(true == sendreq->req_throttle_sends &&
-       sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
+            sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
         return OMPI_SUCCESS;
 
     range = get_send_range(sendreq);
 
     while(range && (false == sendreq->req_throttle_sends ||
-          sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
+            sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
         mca_pml_ob1_frag_hdr_t* hdr;
         mca_btl_base_descriptor_t* des;
         int rc, btl_idx;
@@ -1044,7 +1044,7 @@ cannot_pack:
             range->range_btls[btl_idx].length -= size;
             range->range_send_length -= size;
             range->range_send_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, 1);
             if(range->range_send_length == 0) {
                 range = get_next_send_range(sendreq, range);
                 prev_bytes_remaining = 0;
@@ -1060,7 +1060,7 @@ cannot_pack:
             range->range_btls[btl_idx].length -= size;
             range->range_send_length -= size;
             range->range_send_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, 1);
             if(range->range_send_length == 0) {
                 range = get_next_send_range(sendreq, range);
                 prev_bytes_remaining = 0;
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
index 80acc93f4e..5cb21f6aba 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@@ -45,11 +45,11 @@ struct mca_pml_ob1_send_request_t {
     mca_pml_base_send_request_t req_send;
     mca_bml_base_endpoint_t* req_endpoint;
     opal_ptr_t req_recv;
-    int32_t req_state;
-    int32_t req_lock;
-    bool req_throttle_sends;
-    size_t req_pipeline_depth;
-    size_t req_bytes_delivered;
+    int32_t req_state;
+    int32_t req_lock;
+    bool    req_throttle_sends;
+    int32_t req_pipeline_depth;
+    size_t  req_bytes_delivered;
     uint32_t req_rdma_cnt;
     mca_pml_ob1_send_pending_t req_pending;
     opal_mutex_t req_send_range_lock;
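The pattern this patch standardizes on: the in-flight fragment count is a signed 32-bit value updated with OPAL_THREAD_ADD32 and compared against the configured depth before more work is scheduled. Below is a reduced, self-contained sketch using C11 atomics in place of the OPAL macros; request_t and both helpers are hypothetical.

/* Illustration only -- not part of the patch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
    _Atomic int32_t pipeline_depth;   /* fragments currently in flight */
} request_t;

static bool try_schedule_fragment(request_t *req, int32_t depth_limit)
{
    if (atomic_load(&req->pipeline_depth) >= depth_limit) {
        return false;                           /* pipeline full: throttle */
    }
    atomic_fetch_add(&req->pipeline_depth, 1);  /* cf. OPAL_THREAD_ADD32(..., 1) */
    /* ... post one more fragment here ... */
    return true;
}

static void fragment_completed(request_t *req)
{
    atomic_fetch_add(&req->pipeline_depth, -1); /* cf. OPAL_THREAD_ADD32(..., -1) */
}

As in ob1, the check and the increment are not one atomic unit; the PML keeps scheduling exclusive separately (via req_lock), so the counter only has to survive concurrent decrements from completion callbacks.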
From 866899e8364cab4ba3d4e8fc5f286f3ecc791858 Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Fri, 28 Apr 2017 01:17:52 -0400
Subject: [PATCH 4/5] Always abide by the RDMA pipeline limit.

Signed-off-by: George Bosilca
---
 ompi/mca/pml/ob1/pml_ob1_recvreq.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 3cd4c0d8fb..bbc90e1e47 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -983,14 +983,10 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
         } while(!size);
         btl = bml_btl->btl;
 
-        /* NTH: This conditional used to check if there was a registration in
-         * recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to
-         * the btl not needed registration (equivalent to btl->btl_register_mem
-         * != NULL. This new check is equivalent. Note: I feel this protocol
-         * needs work to better improve resource usage when running with a
-         * leave pinned protocol. */
-        if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
-            (size > btl->btl_rdma_pipeline_frag_size)) {
+        /* NTH: Note: I feel this protocol needs work to better improve resource
+         * usage when running with a leave pinned protocol. */
+        /* GB: We should always abide by the BTL RDMA pipeline fragment limit (if one is set) */
+        if ((btl->btl_rdma_pipeline_frag_size != 0) && (size > btl->btl_rdma_pipeline_frag_size)) {
            size = btl->btl_rdma_pipeline_frag_size;
         }
 
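The effect of the simplified conditional: every scheduled RDMA operation is now bounded by btl_rdma_pipeline_frag_size, whether or not the BTL registers memory. A hedged sketch of that splitting loop, with illustrative names rather than the actual ob1 code:

/* Illustration only -- not part of the patch. */
#include <stddef.h>

typedef struct {
    size_t rdma_pipeline_frag_size;  /* 0 means "no limit" */
} btl_limits_t;

/* Schedule one transfer as a series of fragments, each no larger than
 * the BTL's pipeline fragment limit. */
static int schedule_rdma(const btl_limits_t *btl, size_t bytes_remaining,
                         int (*put_fragment)(size_t offset, size_t length))
{
    size_t offset = 0;
    while (bytes_remaining > 0) {
        size_t size = bytes_remaining;
        /* always abide by the limit, if one is set */
        if (0 != btl->rdma_pipeline_frag_size &&
            size > btl->rdma_pipeline_frag_size) {
            size = btl->rdma_pipeline_frag_size;
        }
        int rc = put_fragment(offset, size);
        if (0 != rc) {
            return rc;
        }
        offset += size;
        bytes_remaining -= size;
    }
    return 0;
}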
From d10522a01c448495905b413cdeb8b396b602d29c Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Fri, 28 Apr 2017 01:18:29 -0400
Subject: [PATCH 5/5] Set a hard limit on the TCP max fragment size.

Some OSes have hardcoded limits on how many bytes a single writev or
readv can move, to avoid overflowing an int32_t. We can either detect
this at configure time (which might be a nicer but incomplete
solution), or always force the pipelined protocol over TCP. As the
pipeline only kicks in for data larger than 2GB, no performance
penalty is to be expected.

Signed-off-by: George Bosilca
---
 opal/mca/btl/tcp/btl_tcp_component.c | 13 +++++++++++--
 opal/mca/btl/tcp/btl_tcp_frag.c      |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c
index bbaf27f422..40b1b41047 100644
--- a/opal/mca/btl/tcp/btl_tcp_component.c
+++ b/opal/mca/btl/tcp/btl_tcp_component.c
@@ -318,7 +318,12 @@ static int mca_btl_tcp_component_register(void)
     mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
     mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
     mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
-    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
+    /* Some OSes have hard coded limits on how many bytes can be manipulated
+     * by each writev operation. Force a reasonable limit, to prevent overflowing
+     * a signed 32-bit integer (limit comes from BSD and OS X). We remove 1k to
+     * make some room for our internal headers.
+     */
+    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
     mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
     mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
                                          MCA_BTL_FLAGS_SEND_INPLACE |
@@ -335,7 +340,11 @@ static int mca_btl_tcp_component_register(void)
     mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
                                 &mca_btl_tcp_module.super);
-
+    if (mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size > ((1UL<<31) - 1024) ) {
+        /* Assume a hard limit. A test in configure would be a better solution, but until then
+         * kicking-in the pipeline RDMA for extremely large data is good enough. */
+        mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
+    }
     mca_btl_tcp_param_register_int ("disable_family", NULL, 0, OPAL_INFO_LVL_2,
                                     &mca_btl_tcp_component.tcp_disable_family);
 
     return mca_btl_tcp_component_verify();
diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c
index a0ff9ebb78..56775067c9 100644
--- a/opal/mca/btl/tcp/btl_tcp_frag.c
+++ b/opal/mca/btl/tcp/btl_tcp_frag.c
@@ -173,7 +173,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
     num_vecs = frag->iov_cnt;
 #if MCA_BTL_TCP_ENDPOINT_CACHE
     if( 0 != btl_endpoint->endpoint_cache_length ) {
-        ssize_t length;
+        size_t length;
         /* It's strange at the first look but cnt have to be set to the full amount of data
          * available. After going to advance_iov_position we will use cnt to detect if there
          * is still some data pending.
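For reference, the arithmetic behind the new constant: (1UL << 31) is 2GB, and 1024 bytes are subtracted to leave room for the BTL's internal headers. A small sketch of the clamp as a standalone helper; TCP_FRAG_HARD_LIMIT and clamp_frag_size are illustrative names, while the patch itself hard-codes the same expression inline.

/* Illustration only -- not part of the patch. */
#include <stddef.h>

#define TCP_FRAG_HARD_LIMIT ((1UL << 31) - 1024)  /* 2GB minus header room */

static size_t clamp_frag_size(size_t requested)
{
    /* anything larger is carried by the pipelined protocol as a
     * sequence of fragments at or below the hard limit */
    if (requested > TCP_FRAG_HARD_LIMIT) {
        return (size_t) TCP_FRAG_HARD_LIMIT;
    }
    return requested;
}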