From 4db3730a25993ec79c2df1f52f9cb5df13bdcff2 Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Thu, 3 Dec 2015 10:46:11 -0500
Subject: [PATCH 1/5] Be consistent for atomic operations and add a delta of
 the same type.

Signed-off-by: George Bosilca
---
 opal/include/opal/sys/atomic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/opal/include/opal/sys/atomic.h b/opal/include/opal/sys/atomic.h
index 9aa0b40247..f0ed2e4806 100644
--- a/opal/include/opal/sys/atomic.h
+++ b/opal/include/opal/sys/atomic.h
@@ -447,7 +447,7 @@ int64_t opal_atomic_sub_64(volatile int64_t *addr, int64_t delta);
  */
 #if defined(DOXYGEN) || OPAL_ENABLE_DEBUG
 static inline size_t
-opal_atomic_add_size_t(volatile size_t *addr, int delta)
+opal_atomic_add_size_t(volatile size_t *addr, size_t delta)
 {
 #if SIZEOF_SIZE_T == 4
     return (size_t) opal_atomic_add_32((int32_t*) addr, delta);
@@ -458,7 +458,7 @@ opal_atomic_add_size_t(volatile size_t *addr, int delta)
 #endif
 }
 static inline size_t
-opal_atomic_sub_size_t(volatile size_t *addr, int delta)
+opal_atomic_sub_size_t(volatile size_t *addr, size_t delta)
 {
 #if SIZEOF_SIZE_T == 4
     return (size_t) opal_atomic_sub_32((int32_t*) addr, delta);
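Why the prototype matters: with an int delta the narrowing happens silently at the call site, so on 64-bit platforms a large size_t argument can be truncated before the atomic update ever sees it. Below is a self-contained sketch of that failure mode; the add_with_* helpers are hypothetical stand-ins for the wrappers above (not OPAL functions), and a 64-bit size_t is assumed.

/* Illustration only -- not part of the patch. */
#include <stdio.h>
#include <stddef.h>

static size_t add_with_int_delta(size_t *addr, int delta)
{
    return *addr += (size_t) delta;  /* delta was already narrowed by the caller */
}

static size_t add_with_size_t_delta(size_t *addr, size_t delta)
{
    return *addr += delta;           /* full-width update */
}

int main(void)
{
    size_t a = 0, b = 0;
    size_t big = (size_t) 1 << 32;   /* 4GB: the low 32 bits are all zero */

    add_with_int_delta(&a, (int) big);   /* (int) big is 0 on typical LP64: adds nothing */
    add_with_size_t_delta(&b, big);      /* adds the full 4GB */
    printf("int delta: %zu, size_t delta: %zu\n", a, b);
    return 0;
}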
From c340da25864069c8b179abde44b8242a92ab2515 Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Thu, 3 Dec 2015 10:47:00 -0500
Subject: [PATCH 2/5] A first cut at the large data problem with TCP.

As long as writev and readv support a sum larger than a uint32_t,
this version will work. For the other OSes a different patch is
required. This patch is a slight modification of the one proposed
by @ggouaillardet.

Signed-off-by: George Bosilca
---
 opal/mca/btl/tcp/btl_tcp_frag.c | 17 ++++++++---------
 opal/mca/btl/tcp/btl_tcp_frag.h |  4 ++--
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c
index 75901e9c47..a0ff9ebb78 100644
--- a/opal/mca/btl/tcp/btl_tcp_frag.c
+++ b/opal/mca/btl/tcp/btl_tcp_frag.c
@@ -112,11 +112,11 @@ size_t mca_btl_tcp_frag_dump(mca_btl_tcp_frag_t* frag, char* msg, char* buf, siz
 
 bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
 {
-    ssize_t cnt = -1;
+    ssize_t cnt;
     size_t i, num_vecs;
 
     /* non-blocking write, but continue if interrupted */
-    while(cnt < 0) {
+    do {
         cnt = writev(sd, frag->iov_ptr, frag->iov_cnt);
         if(cnt < 0) {
             switch(opal_socket_errno) {
@@ -140,11 +140,11 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
                 return false;
             }
         }
-    }
+    } while(cnt < 0);
 
     /* if the write didn't complete - update the iovec state */
     num_vecs = frag->iov_cnt;
-    for(i=0; i<num_vecs; i++) {
+    for( i = 0; i < num_vecs; i++ ) {
         if(cnt >= (ssize_t)frag->iov_ptr->iov_len) {
             cnt -= frag->iov_ptr->iov_len;
             frag->iov_ptr++;
@@ -166,14 +166,14 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
 bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
 {
     mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
-    int i, num_vecs, dont_copy_data = 0;
     ssize_t cnt;
+    int32_t i, num_vecs, dont_copy_data = 0;
 
 repeat:
     num_vecs = frag->iov_cnt;
 #if MCA_BTL_TCP_ENDPOINT_CACHE
     if( 0 != btl_endpoint->endpoint_cache_length ) {
-        size_t length;
+        ssize_t length;
         /* It's strange at the first look but cnt have to be set to the full amount of data
          * available. After going to advance_iov_position we will use cnt to detect if there
          * is still some data pending.
@@ -208,8 +208,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
 #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */
 
     /* non-blocking read, but continue if interrupted */
-    cnt = -1;
-    while( cnt < 0 ) {
+    do {
         cnt = readv(sd, frag->iov_ptr, num_vecs);
         if( 0 < cnt ) goto advance_iov_position;
         if( cnt == 0 ) {
@@ -247,7 +246,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
             mca_btl_tcp_endpoint_close(btl_endpoint);
             return false;
         }
-    }
+    } while( cnt < 0 );
 
 advance_iov_position:
     /* if the read didn't complete - update the iovec state */
diff --git a/opal/mca/btl/tcp/btl_tcp_frag.h b/opal/mca/btl/tcp/btl_tcp_frag.h
index b73da8f6ed..e1b068502f 100644
--- a/opal/mca/btl/tcp/btl_tcp_frag.h
+++ b/opal/mca/btl/tcp/btl_tcp_frag.h
@@ -53,8 +53,8 @@ struct mca_btl_tcp_frag_t {
     mca_btl_tcp_hdr_t hdr;
     struct iovec iov[MCA_BTL_TCP_FRAG_IOVEC_NUMBER + 1];
     struct iovec *iov_ptr;
-    size_t iov_cnt;
-    size_t iov_idx;
+    uint32_t iov_cnt;
+    uint32_t iov_idx;
     size_t size;
     uint16_t next_step;
     int rc;
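The do/while rewrite above is the classic restart-on-EINTR idiom, and the loop after it advances the iovec array by however many bytes the kernel accepted. A minimal sketch of the combined pattern follows; flush_iovecs is a hypothetical helper distilled from mca_btl_tcp_frag_send above (the real function also classifies the socket errors). The key detail is that the count stays an ssize_t, so a single writev can report a total larger than 32 bits on OSes that allow it.

/* Illustration only -- not part of the patch. Returns true when every
 * iovec has been fully written. */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>

static bool flush_iovecs(int sd, struct iovec **iov_ptr, uint32_t *iov_cnt)
{
    ssize_t cnt;

    /* non-blocking write, but restart if interrupted */
    do {
        cnt = writev(sd, *iov_ptr, *iov_cnt);
    } while (cnt < 0 && EINTR == errno);

    if (cnt < 0) {
        return false;  /* EAGAIN and friends: let the event loop retry later */
    }
    /* consume whole iovec entries first... */
    while (*iov_cnt > 0 && cnt >= (ssize_t) (*iov_ptr)->iov_len) {
        cnt -= (*iov_ptr)->iov_len;
        (*iov_ptr)++;
        (*iov_cnt)--;
    }
    /* ...then trim the partially written entry */
    if (*iov_cnt > 0) {
        (*iov_ptr)->iov_base = (char *) (*iov_ptr)->iov_base + cnt;
        (*iov_ptr)->iov_len -= cnt;
    }
    return 0 == *iov_cnt;
}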
From 050bd3b6d73fe66c12de84696a9849a94dfd374c Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Thu, 3 Dec 2015 10:48:48 -0500
Subject: [PATCH 3/5] Make the pipeline depth an int instead of a size_t.

While they are supposed to be unsigned, casting them to a signed value
for all atomic operations is as error-prone as handling them as signed
entities.

Signed-off-by: George Bosilca
---
 ompi/mca/pml/ob1/pml_ob1.h           |  4 ++--
 ompi/mca/pml/ob1/pml_ob1_component.c |  4 ++--
 ompi/mca/pml/ob1/pml_ob1_recvreq.c   |  8 ++++----
 ompi/mca/pml/ob1/pml_ob1_recvreq.h   | 12 ++++++------
 ompi/mca/pml/ob1/pml_ob1_sendreq.c   | 10 +++++-----
 ompi/mca/pml/ob1/pml_ob1_sendreq.h   | 10 +++++-----
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h
index 10162916c6..4826587564 100644
--- a/ompi/mca/pml/ob1/pml_ob1.h
+++ b/ompi/mca/pml/ob1/pml_ob1.h
@@ -55,8 +55,8 @@ struct mca_pml_ob1_t {
     int free_list_num;      /* initial size of free list */
     int free_list_max;      /* maximum size of free list */
     int free_list_inc;      /* number of elements to grow free list */
-    size_t send_pipeline_depth;
-    size_t recv_pipeline_depth;
+    int32_t send_pipeline_depth;
+    int32_t recv_pipeline_depth;
     size_t rdma_retries_limit;
     int max_rdma_per_request;
     int max_send_per_range;
diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c
index 6557bc2037..1bb14e179e 100644
--- a/ompi/mca/pml/ob1/pml_ob1_component.c
+++ b/ompi/mca/pml/ob1/pml_ob1_component.c
@@ -184,8 +184,8 @@ static int mca_pml_ob1_component_register(void)
     mca_pml_ob1_param_register_int("free_list_max", -1, &mca_pml_ob1.free_list_max);
     mca_pml_ob1_param_register_int("free_list_inc", 64, &mca_pml_ob1.free_list_inc);
     mca_pml_ob1_param_register_int("priority", 20, &mca_pml_ob1.priority);
-    mca_pml_ob1_param_register_sizet("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth);
-    mca_pml_ob1_param_register_sizet("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth);
+    mca_pml_ob1_param_register_int("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth);
+    mca_pml_ob1_param_register_int("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth);
 
     /* NTH: we can get into a live-lock situation in the RDMA failure path so disable
      * RDMA retries for now. Falling back to send may suck but it is better than
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 5b45410874..3cd4c0d8fb 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -190,7 +190,7 @@ static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t r
     mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
 
-    OPAL_THREAD_SUB_SIZE_T(&recvreq->req_pipeline_depth, 1);
+    OPAL_THREAD_ADD32(&recvreq->req_pipeline_depth, -1);
 
     MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
 
@@ -198,7 +198,7 @@ static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t r
     assert ((uint64_t) rdma_size == frag->rdma_length);
 
     /* check completion status */
-    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
+    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, rdma_size);
     if (recv_request_pml_complete_check(recvreq) == false &&
         recvreq->req_rdma_offset < recvreq->req_send_offset) {
         /* schedule additional rdma operations */
@@ -951,7 +951,7 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
     }
 
     while(bytes_remaining > 0 &&
-          recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
+            recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
         mca_pml_ob1_rdma_frag_t *frag = NULL;
         mca_btl_base_module_t *btl;
         int rc, rdma_idx;
@@ -1028,7 +1028,7 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
         if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
             /* update request state */
             recvreq->req_rdma_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&recvreq->req_pipeline_depth, 1);
             recvreq->req_rdma[rdma_idx].length -= size;
             bytes_remaining -= size;
         } else {
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
index 6d57569323..82c4767d83 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -41,12 +41,12 @@ BEGIN_C_DECLS
 struct mca_pml_ob1_recv_request_t {
     mca_pml_base_recv_request_t req_recv;
     opal_ptr_t remote_req_send;
-    int32_t req_lock;
-    size_t req_pipeline_depth;
-    size_t req_bytes_received;   /**< amount of data transferred into the user buffer */
-    size_t req_bytes_expected;   /**< local size of the data as suggested by the user */
-    size_t req_rdma_offset;
-    size_t req_send_offset;
+    int32_t req_lock;
+    int32_t req_pipeline_depth;
+    size_t  req_bytes_received;  /**< amount of data transferred into the user buffer */
+    size_t  req_bytes_expected;  /**< local size of the data as suggested by the user */
+    size_t  req_rdma_offset;
+    size_t  req_send_offset;
     uint32_t req_rdma_cnt;
     uint32_t req_rdma_idx;
     bool req_pending;
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
index f0a227f5dc..f358d733da 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -313,7 +313,7 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
                              des->des_segment_count,
                              sizeof(mca_pml_ob1_frag_hdr_t));
 
-    OPAL_THREAD_SUB_SIZE_T(&sendreq->req_pipeline_depth, 1);
+    OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, -1);
     OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
 
     if(send_request_pml_complete_check(sendreq) == false) {
@@ -913,13 +913,13 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
 
     /* check pipeline_depth here before attempting to get any locks */
     if(true == sendreq->req_throttle_sends &&
-       sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
+            sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
         return OMPI_SUCCESS;
 
     range = get_send_range(sendreq);
 
     while(range && (false == sendreq->req_throttle_sends ||
-          sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
+            sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
         mca_pml_ob1_frag_hdr_t* hdr;
         mca_btl_base_descriptor_t* des;
         int rc, btl_idx;
@@ -1044,7 +1044,7 @@ cannot_pack:
             range->range_btls[btl_idx].length -= size;
             range->range_send_length -= size;
             range->range_send_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, 1);
             if(range->range_send_length == 0) {
                 range = get_next_send_range(sendreq, range);
                 prev_bytes_remaining = 0;
@@ -1060,7 +1060,7 @@ cannot_pack:
             range->range_btls[btl_idx].length -= size;
             range->range_send_length -= size;
             range->range_send_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, 1);
             if(range->range_send_length == 0) {
                 range = get_next_send_range(sendreq, range);
                 prev_bytes_remaining = 0;
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
index 80acc93f4e..5cb21f6aba 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@@ -45,11 +45,11 @@ struct mca_pml_ob1_send_request_t {
     mca_pml_base_send_request_t req_send;
     mca_bml_base_endpoint_t* req_endpoint;
     opal_ptr_t req_recv;
-    int32_t req_state;
-    int32_t req_lock;
-    bool req_throttle_sends;
-    size_t req_pipeline_depth;
-    size_t req_bytes_delivered;
+    int32_t req_state;
+    int32_t req_lock;
+    bool    req_throttle_sends;
+    int32_t req_pipeline_depth;
+    size_t  req_bytes_delivered;
     uint32_t req_rdma_cnt;
     mca_pml_ob1_send_pending_t req_pending;
     opal_mutex_t req_send_range_lock;
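The pattern this patch standardizes on: the in-flight fragment count is a signed 32-bit value updated with OPAL_THREAD_ADD32 and compared against the configured depth before more work is scheduled. Below is a reduced, self-contained sketch using C11 atomics in place of the OPAL macros; request_t and both helpers are hypothetical.

/* Illustration only -- not part of the patch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
    _Atomic int32_t pipeline_depth;   /* fragments currently in flight */
} request_t;

static bool try_schedule_fragment(request_t *req, int32_t depth_limit)
{
    if (atomic_load(&req->pipeline_depth) >= depth_limit) {
        return false;                           /* pipeline full: throttle */
    }
    atomic_fetch_add(&req->pipeline_depth, 1);  /* cf. OPAL_THREAD_ADD32(..., 1) */
    /* ... post one more fragment here ... */
    return true;
}

static void fragment_completed(request_t *req)
{
    atomic_fetch_add(&req->pipeline_depth, -1); /* cf. OPAL_THREAD_ADD32(..., -1) */
}

As in ob1, the check and the increment are not one atomic unit; the PML keeps scheduling exclusive separately (via req_lock), so the counter only has to survive concurrent decrements from completion callbacks.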
From 866899e8364cab4ba3d4e8fc5f286f3ecc791858 Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Fri, 28 Apr 2017 01:17:52 -0400
Subject: [PATCH 4/5] Always abide by the RDMA pipeline limit.

Signed-off-by: George Bosilca
---
 ompi/mca/pml/ob1/pml_ob1_recvreq.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 3cd4c0d8fb..bbc90e1e47 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -983,14 +983,10 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
         } while(!size);
         btl = bml_btl->btl;
 
-        /* NTH: This conditional used to check if there was a registration in
-         * recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to
-         * the btl not needed registration (equivalent to btl->btl_register_mem
-         * != NULL. This new check is equivalent. Note: I feel this protocol
-         * needs work to better improve resource usage when running with a
-         * leave pinned protocol. */
-        if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
-            (size > btl->btl_rdma_pipeline_frag_size)) {
+        /* NTH: Note: I feel this protocol needs work to better improve resource
+         * usage when running with a leave pinned protocol. */
+        /* GB: We should always abide by the BTL RDMA pipeline fragment limit (if one is set) */
+        if ((btl->btl_rdma_pipeline_frag_size != 0) && (size > btl->btl_rdma_pipeline_frag_size)) {
            size = btl->btl_rdma_pipeline_frag_size;
         }
 
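The effect of the simplified conditional: every scheduled RDMA operation is now bounded by btl_rdma_pipeline_frag_size, whether or not the BTL registers memory. A hedged sketch of that splitting loop, with illustrative names rather than the actual ob1 code:

/* Illustration only -- not part of the patch. */
#include <stddef.h>

typedef struct {
    size_t rdma_pipeline_frag_size;  /* 0 means "no limit" */
} btl_limits_t;

/* Schedule one transfer as a series of fragments, each no larger than
 * the BTL's pipeline fragment limit. */
static int schedule_rdma(const btl_limits_t *btl, size_t bytes_remaining,
                         int (*put_fragment)(size_t offset, size_t length))
{
    size_t offset = 0;
    while (bytes_remaining > 0) {
        size_t size = bytes_remaining;
        /* always abide by the limit, if one is set */
        if (0 != btl->rdma_pipeline_frag_size &&
            size > btl->rdma_pipeline_frag_size) {
            size = btl->rdma_pipeline_frag_size;
        }
        int rc = put_fragment(offset, size);
        if (0 != rc) {
            return rc;
        }
        offset += size;
        bytes_remaining -= size;
    }
    return 0;
}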
From d10522a01c448495905b413cdeb8b396b602d29c Mon Sep 17 00:00:00 2001
From: George Bosilca
Date: Fri, 28 Apr 2017 01:18:29 -0400
Subject: [PATCH 5/5] Set a hard limit on the TCP max fragment size.

Some OSes have hardcoded limits on how many bytes a single writev or
readv can move, to avoid overflowing an int32_t. We can either detect
this at configure time (which might be a nicer but incomplete
solution), or always force the pipelined protocol over TCP. As the
pipeline only kicks in for data larger than 2GB, no performance
penalty is to be expected.

Signed-off-by: George Bosilca
---
 opal/mca/btl/tcp/btl_tcp_component.c | 13 +++++++++++--
 opal/mca/btl/tcp/btl_tcp_frag.c      |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c
index bbaf27f422..40b1b41047 100644
--- a/opal/mca/btl/tcp/btl_tcp_component.c
+++ b/opal/mca/btl/tcp/btl_tcp_component.c
@@ -318,7 +318,12 @@ static int mca_btl_tcp_component_register(void)
     mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
     mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
     mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
-    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
+    /* Some OSes have hard coded limits on how many bytes can be manipulated
+     * by each writev operation. Force a reasonable limit, to prevent overflowing
+     * a signed 32-bit integer (limit comes from BSD and OS X). We remove 1k to
+     * make some room for our internal headers.
+     */
+    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
     mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
     mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
                                          MCA_BTL_FLAGS_SEND_INPLACE |
@@ -335,7 +340,11 @@ static int mca_btl_tcp_component_register(void)
     mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
                                 &mca_btl_tcp_module.super);
-
+    if (mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size > ((1UL<<31) - 1024) ) {
+        /* Assume a hard limit. A test in configure would be a better solution, but until then
+         * kicking-in the pipeline RDMA for extremely large data is good enough. */
+        mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
+    }
     mca_btl_tcp_param_register_int ("disable_family", NULL, 0, OPAL_INFO_LVL_2,
                                     &mca_btl_tcp_component.tcp_disable_family);
 
     return mca_btl_tcp_component_verify();
diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c
index a0ff9ebb78..56775067c9 100644
--- a/opal/mca/btl/tcp/btl_tcp_frag.c
+++ b/opal/mca/btl/tcp/btl_tcp_frag.c
@@ -173,7 +173,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
     num_vecs = frag->iov_cnt;
 #if MCA_BTL_TCP_ENDPOINT_CACHE
     if( 0 != btl_endpoint->endpoint_cache_length ) {
-        ssize_t length;
+        size_t length;
         /* It's strange at the first look but cnt have to be set to the full amount of data
          * available. After going to advance_iov_position we will use cnt to detect if there
          * is still some data pending.
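For reference, the arithmetic behind the new constant: (1UL << 31) is 2GB, and 1024 bytes are subtracted to leave room for the BTL's internal headers. A small sketch of the clamp as a standalone helper; TCP_FRAG_HARD_LIMIT and clamp_frag_size are illustrative names, while the patch itself hard-codes the same expression inline.

/* Illustration only -- not part of the patch. */
#include <stddef.h>

#define TCP_FRAG_HARD_LIMIT ((1UL << 31) - 1024)  /* 2GB minus header room */

static size_t clamp_frag_size(size_t requested)
{
    /* anything larger is carried by the pipelined protocol as a
     * sequence of fragments at or below the hard limit */
    if (requested > TCP_FRAG_HARD_LIMIT) {
        return (size_t) TCP_FRAG_HARD_LIMIT;
    }
    return requested;
}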