diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c
index d82daee29a..eac927c489 100644
--- a/ompi/mca/pml/ob1/pml_ob1.c
+++ b/ompi/mca/pml/ob1/pml_ob1.c
@@ -14,7 +14,7 @@
  * Copyright (c) 2006-2008 University of Houston. All rights reserved.
  * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
  * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2012      Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
@@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
     case MCA_PML_OB1_HDR_TYPE_RGET:
         type = "RGET";
         snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
-                  "seg_cnt %d hdr_des %" PRIu64,
+                  "frag %" PRIu64 " src_ptr %" PRIu64,
                   hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
                   hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
-                  hdr->hdr_rndv.hdr_msg_length,
-                  hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval);
+                  hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
+                  hdr->hdr_rget.hdr_src_ptr);
         break;
     case MCA_PML_OB1_HDR_TYPE_ACK:
         type = "ACK";
-        snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64,
+        snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
                   hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
-                  hdr->hdr_ack.hdr_send_offset);
+                  hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
         break;
     case MCA_PML_OB1_HDR_TYPE_FRAG:
         type = "FRAG";
@@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
         break;
     case MCA_PML_OB1_HDR_TYPE_PUT:
         type = "PUT";
-        snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]",
-                  hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval,
+        snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
+                  " dst_ptr %" PRIu64 " dst_size %" PRIu64,
+                  hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
                   hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
-                  hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len);
+                  hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
         break;
     case MCA_PML_OB1_HDR_TYPE_FIN:
         type = "FIN";
@@ -638,37 +639,33 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
  */
 int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                           mca_bml_base_btl_t* bml_btl,
-                          opal_ptr_t hdr_des,
+                          opal_ptr_t hdr_frag,
+                          uint64_t rdma_size,
                           uint8_t order,
-                          uint32_t status )
+                          int status )
 {
     mca_btl_base_descriptor_t* fin;
     mca_pml_ob1_fin_hdr_t* hdr;
     int rc;

     mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL);

     if(NULL == fin) {
-        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
+        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
     fin->des_cbfunc = mca_pml_ob1_fin_completion;
     fin->des_cbdata = NULL;

     /* fill in header */
-    hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
-    hdr->hdr_des = hdr_des;
-    hdr->hdr_fail = status;
+    hdr = (mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval;
+    mca_pml_ob1_fin_hdr_prepare (hdr, 0, hdr_frag.lval,
+                                 status ? status : (int64_t) rdma_size);

     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);

     /* queue request */
-    rc = mca_bml_base_send( bml_btl,
-                            fin,
-                            MCA_PML_OB1_HDR_TYPE_FIN );
+    rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
     if( OPAL_LIKELY( rc >= 0 ) ) {
         if( OPAL_LIKELY( 1 == rc ) ) {
             MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@@ -676,7 +673,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
         return OMPI_SUCCESS;
     }
     mca_bml_base_free(bml_btl, fin);
-    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
+    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
     return OMPI_ERR_OUT_OF_RESOURCE;
 }

@@ -717,6 +714,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
                 pckt->hdr.hdr_ack.hdr_src_req.lval,
                 pckt->hdr.hdr_ack.hdr_dst_req.pval,
                 pckt->hdr.hdr_ack.hdr_send_offset,
+                pckt->hdr.hdr_ack.hdr_send_size,
                 pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
             if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@@ -728,9 +726,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
             break;
         case MCA_PML_OB1_HDR_TYPE_FIN:
             rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
-                                      pckt->hdr.hdr_fin.hdr_des,
+                                      pckt->hdr.hdr_fin.hdr_frag,
+                                      pckt->hdr.hdr_fin.hdr_size,
                                       pckt->order,
-                                      pckt->hdr.hdr_fin.hdr_fail);
+                                      pckt->status);
             if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                 return;
             }
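
The FIN message now folds completion status and completed size into one signed field: a positive hdr_size reports the number of bytes the RDMA operation completed, a negative value carries an error code. The sketch below is illustrative only and not part of the patch; the helper name is hypothetical.

    #include <stdint.h>

    /* Hypothetical helper: how a consumer of the new FIN header can
     * interpret the signed hdr_size field. */
    static inline int ob1_fin_status (int64_t hdr_size, uint64_t *bytes_out)
    {
        if (hdr_size < 0) {
            /* negative size: the value is an error code */
            return (int) hdr_size;
        }

        /* positive size: the number of bytes completed by the RDMA op */
        *bytes_out = (uint64_t) hdr_size;
        return 0; /* success */
    }
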
diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h
index e059d7586d..924d3258f8 100644
--- a/ompi/mca/pml/ob1/pml_ob1.h
+++ b/ompi/mca/pml/ob1/pml_ob1.h
@@ -12,7 +12,7 @@
  *                         All rights reserved.
  * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved
  * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2012      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t {
     mca_pml_ob1_hdr_t hdr;
     struct mca_bml_base_btl_t *bml_btl;
     uint8_t order;
+    int status;
 };
 typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
 OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
@@ -234,17 +235,17 @@ do {                                                    \
                           (ompi_free_list_item_t*)pckt);        \
 } while(0)

-#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S)           \
+#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S)       \
 do {                                                            \
     mca_pml_ob1_pckt_pending_t *_pckt;                          \
                                                                 \
     MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt);                      \
-    _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;  \
-    _pckt->hdr.hdr_fin.hdr_des = (D);                           \
-    _pckt->hdr.hdr_fin.hdr_fail = (S);                          \
+    mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0,        \
+                                 (D).lval, (Sz));               \
     _pckt->proc = (P);                                          \
     _pckt->bml_btl = (B);                                       \
     _pckt->order = (O);                                         \
+    _pckt->status = (S);                                        \
     OPAL_THREAD_LOCK(&mca_pml_ob1.lock);                        \
     opal_list_append(&mca_pml_ob1.pckt_pending,                 \
                      (opal_list_item_t*)_pckt);                 \
@@ -253,7 +254,7 @@ do {                                                    \

 int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
-                         opal_ptr_t hdr_des, uint8_t order, uint32_t status);
+                         opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status);

 /* This function tries to resend FIN/ACK packets from pckt_pending queue.
  * Packets are added to the queue when sending of FIN or ACK is failed due to
@@ -283,20 +284,6 @@ void mca_pml_ob1_process_pending_rdma(void);
 /*
  * Compute the total number of bytes on supplied descriptor
  */
-static inline size_t
-mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments,
-                                   size_t count, size_t hdrlen)
-{
-    size_t i, length = 0;
-    mca_btl_base_segment_t *segment = (mca_btl_base_segment_t*)segments;
-
-    for (i = 0; i < count ; ++i) {
-        length += segment->seg_len;
-        segment = (mca_btl_base_segment_t *)((char *)segment + seg_size);
-    }
-    return (length - hdrlen);
-}
-
 static inline size_t
 mca_pml_ob1_compute_segment_length_base (mca_btl_base_segment_t *segments,
                                          size_t count, size_t hdrlen)
@@ -338,7 +325,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
 /* represent BTL chosen for sending request */
 struct mca_pml_ob1_com_btl_t {
     mca_bml_base_btl_t *bml_btl;
-    struct mca_mpool_base_registration_t* btl_reg;
+    struct mca_btl_base_registration_handle_t *btl_reg;
     size_t length;
 };
 typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;
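
Throughout the patch, local fragment and request pointers travel inside fixed-width 64-bit header fields via opal_ptr_t (hdr_frag.lval on the wire, .pval when dereferenced by the owner). The self-contained sketch below uses a stand-in union to show the round trip; the real opal_ptr_t in opal/types.h has additional members.

    #include <stdint.h>
    #include <assert.h>

    /* Stand-in for opal_ptr_t: a local pointer packed into a fixed-width
     * integer so it can ride in a wire header and come back unchanged. */
    typedef union {
        uint64_t lval;   /* used on the wire (cf. hdr_frag.lval above) */
        void    *pval;   /* used when the owning process dereferences it */
    } ptr_union_t;

    int main (void)
    {
        int object = 42;
        ptr_union_t p = { .lval = 0 };  /* zero first: pval may be narrower */

        p.pval = &object;               /* sender stores its local pointer */
        uint64_t on_wire = p.lval;      /* travels inside a FIN/RGET header */

        ptr_union_t q = { .lval = on_wire };
        assert (*(int *) q.pval == 42); /* valid on the owning process only */
        return 0;
    }
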
diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c
index 3574bb390d..29462932f1 100644
--- a/ompi/mca/pml/ob1/pml_ob1_cuda.c
+++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -12,6 +13,8 @@
  * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -123,19 +126,20 @@ size_t mca_pml_ob1_rdma_cuda_btls(
             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);

         if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
-            mca_mpool_base_registration_t* reg = NULL;
-            mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
+            mca_btl_base_registration_handle_t *handle = NULL;

-            if( NULL != btl_mpool ) {
+            if( NULL != bml_btl->btl->btl_register_mem ) {
                 /* register the memory */
-                btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);
+                handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint,
+                                                         base, size, MCA_BTL_REG_FLAG_CUDA_GPU_MEM |
+                                                         MCA_BTL_REG_FLAG_REMOTE_READ);
             }

-            if(NULL == reg)
+            if(NULL == handle)
                 continue;

             rdma_btls[num_btls_used].bml_btl = bml_btl;
-            rdma_btls[num_btls_used].btl_reg = reg;
+            rdma_btls[num_btls_used].btl_reg = handle;
             weight_total += bml_btl->btl_weight;
             num_btls_used++;
         }
diff --git a/ompi/mca/pml/ob1/pml_ob1_hdr.h b/ompi/mca/pml/ob1/pml_ob1_hdr.h
index 71e52ae608..e53f4afd90 100644
--- a/ompi/mca/pml/ob1/pml_ob1_hdr.h
+++ b/ompi/mca/pml/ob1/pml_ob1_hdr.h
@@ -11,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2009      IBM Corporation. All rights reserved.
- * Copyright (c) 2012      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -64,6 +64,13 @@ struct mca_pml_ob1_common_hdr_t {
 };
 typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;

+static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
+                                                   uint8_t hdr_flags)
+{
+    hdr->hdr_type = hdr_type;
+    hdr->hdr_flags = hdr_flags;
+}
+
 #define MCA_PML_OB1_COMMON_HDR_NTOH(h)
 #define MCA_PML_OB1_COMMON_HDR_HTON(h)

@@ -89,15 +96,19 @@ struct mca_pml_ob1_match_hdr_t {

 typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;

+static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
+                                                  uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
+    hdr->hdr_ctx = hdr_ctx;
+    hdr->hdr_src = hdr_src;
+    hdr->hdr_tag = hdr_tag;
+    hdr->hdr_seq = hdr_seq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_MATCH_HDR_FILL(h) \
-do {                                  \
-    (h).hdr_padding[0] = 0;           \
-    (h).hdr_padding[1] = 0;           \
-} while(0)
-#else
-#define MCA_PML_OB1_MATCH_HDR_FILL(h)
-#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+}

 #define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
 do { \
@@ -111,7 +122,6 @@ do { \
 #define MCA_PML_OB1_MATCH_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_MATCH_HDR_FILL(h); \
     (h).hdr_ctx = htons((h).hdr_ctx); \
     (h).hdr_src = htonl((h).hdr_src); \
     (h).hdr_tag = htonl((h).hdr_tag); \
@@ -130,12 +140,14 @@ struct mca_pml_ob1_rendezvous_hdr_t {
 };
 typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;

-#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RNDV_HDR_FILL(h) \
-    MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match)
-#else
-#define MCA_PML_OB1_RNDV_HDR_FILL(h)
-#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
+                                                       uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
+                                                       uint64_t hdr_msg_length, void *hdr_src_req)
+{
+    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);
+    hdr->hdr_msg_length = hdr_msg_length;
+    hdr->hdr_src_req.pval = hdr_src_req;
+}

 /* Note that hdr_src_req is not put in network byte order because it
    is never processed by the receiver, other than being copied into
@@ -149,7 +161,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
 #define MCA_PML_OB1_RNDV_HDR_HTON(h) \
     do { \
         MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
-        MCA_PML_OB1_RNDV_HDR_FILL(h); \
         (h).hdr_msg_length = hton64((h).hdr_msg_length); \
     } while (0)

@@ -158,38 +169,47 @@
  */
 struct mca_pml_ob1_rget_hdr_t {
     mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
-    uint32_t hdr_seg_cnt;          /**< number of segments for rdma */
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[4];
 #endif
-    opal_ptr_t hdr_des;            /**< source descriptor */
+    opal_ptr_t hdr_frag;           /**< source fragment (for fin) */
+    uint64_t hdr_src_ptr;          /**< source pointer */
+    /* btl registration handle data follows */
 };
 typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;

+static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
+                                                 uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
+                                                 void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
+{
+    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
+                                        hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RGET_HDR_FILL(h)         \
-do {                                         \
-    MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \
-    (h).hdr_padding[0] = 0;                  \
-    (h).hdr_padding[1] = 0;                  \
-    (h).hdr_padding[2] = 0;                  \
-    (h).hdr_padding[3] = 0;                  \
-} while(0)
-#else
-#define MCA_PML_OB1_RGET_HDR_FILL(h)
-#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+#endif
+    hdr->hdr_frag.pval = hdr_frag;
+    hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;

-#define MCA_PML_OB1_RGET_HDR_NTOH(h)    \
-    do {                                \
-        MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
-        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
+    /* copy registration handle */
+    memcpy (hdr + 1, local_handle, local_handle_size);
+}
+
+#define MCA_PML_OB1_RGET_HDR_NTOH(h)                    \
+    do {                                                \
+        MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv);        \
+        (h).hdr_src_ptr = ntoh64((h).hdr_src_ptr);      \
     } while (0)

-#define MCA_PML_OB1_RGET_HDR_HTON(h) \
-    do { \
-        MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
-        MCA_PML_OB1_RGET_HDR_FILL(h); \
-        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
+#define MCA_PML_OB1_RGET_HDR_HTON(h)                    \
+    do {                                                \
+        MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv);        \
+        (h).hdr_src_ptr = hton64((h).hdr_src_ptr);      \
     } while (0)

 /**
@@ -206,19 +226,23 @@ struct mca_pml_ob1_frag_hdr_t {
 };
 typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;

+static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint64_t hdr_frag_offset, void *hdr_src_req,
+                                                 uint64_t hdr_dst_req)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_FRAG_HDR_FILL(h) \
-do {                                 \
-    (h).hdr_padding[0] = 0;          \
-    (h).hdr_padding[1] = 0;          \
-    (h).hdr_padding[2] = 0;          \
-    (h).hdr_padding[3] = 0;          \
-    (h).hdr_padding[4] = 0;          \
-    (h).hdr_padding[5] = 0;          \
-} while(0)
-#else
-#define MCA_PML_OB1_FRAG_HDR_FILL(h)
-#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+    hdr->hdr_padding[4] = 0;
+    hdr->hdr_padding[5] = 0;
+#endif
+    hdr->hdr_frag_offset = hdr_frag_offset;
+    hdr->hdr_src_req.pval = hdr_src_req;
+    hdr->hdr_dst_req.lval = hdr_dst_req;
+}

 #define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
 do { \
@@ -229,7 +253,6 @@ do { \
 #define MCA_PML_OB1_FRAG_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_FRAG_HDR_FILL(h); \
     (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
 } while (0)

@@ -245,38 +268,45 @@ struct mca_pml_ob1_ack_hdr_t {
     opal_ptr_t hdr_src_req;   /**< source request */
     opal_ptr_t hdr_dst_req;   /**< matched receive request */
     uint64_t hdr_send_offset; /**< starting point of copy in/out */
+    uint64_t hdr_send_size;   /**< number of bytes requested (0: all remaining) */
 };
 typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;

+static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
+                                                uint64_t hdr_src_req, void *hdr_dst_req,
+                                                uint64_t hdr_send_offset, uint64_t hdr_send_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_ACK_HDR_FILL(h) \
-do {                                \
-    (h).hdr_padding[0] = 0;         \
-    (h).hdr_padding[1] = 0;         \
-    (h).hdr_padding[2] = 0;         \
-    (h).hdr_padding[3] = 0;         \
-    (h).hdr_padding[4] = 0;         \
-    (h).hdr_padding[5] = 0;         \
-} while (0)
-#else
-#define MCA_PML_OB1_ACK_HDR_FILL(h)
-#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+    hdr->hdr_padding[4] = 0;
+    hdr->hdr_padding[5] = 0;
+#endif
+    hdr->hdr_src_req.lval = hdr_src_req;
+    hdr->hdr_dst_req.pval = hdr_dst_req;
+    hdr->hdr_send_offset = hdr_send_offset;
+    hdr->hdr_send_size = hdr_send_size;
+}

 /* Note that the request headers are not put in NBO because the
    src_req is already in receiver's byte order and the dst_req is not
    used by the receiver for anything other than backpointers in return
    headers */
-#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
-    do { \
-        MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
+#define MCA_PML_OB1_ACK_HDR_NTOH(h)                         \
+    do {                                                    \
+        MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common);        \
         (h).hdr_send_offset = ntoh64((h).hdr_send_offset);  \
+        (h).hdr_send_size = ntoh64((h).hdr_send_size);      \
     } while (0)

-#define MCA_PML_OB1_ACK_HDR_HTON(h) \
-    do { \
-        MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-        MCA_PML_OB1_ACK_HDR_FILL(h); \
+#define MCA_PML_OB1_ACK_HDR_HTON(h)                         \
+    do {                                                    \
+        MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common);        \
         (h).hdr_send_offset = hton64((h).hdr_send_offset);  \
+        (h).hdr_send_size = hton64((h).hdr_send_size);      \
     } while (0)

 /**
@@ -288,38 +318,55 @@ struct mca_pml_ob1_rdma_hdr_t {
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[2];   /** two to pad out the hdr to a 4 byte alignment.
                                   hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
 #endif
-    uint32_t hdr_seg_cnt;     /**< number of segments for rdma */
+    /* TODO: add real support for multiple destination segments */
     opal_ptr_t hdr_req;       /**< destination request */
-    opal_ptr_t hdr_des;       /**< source descriptor */
+    opal_ptr_t hdr_frag;      /**< receiver fragment */
     opal_ptr_t hdr_recv_req;  /**< receive request (NTH: needed for put fallback on send) */
-    uint64_t hdr_rdma_offset; /**< current offset into user buffer */
-    mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
+    uint64_t hdr_rdma_offset; /**< current offset into user buffer */
+    uint64_t hdr_dst_ptr;     /**< destination address */
+    uint64_t hdr_dst_size;    /**< destination size */
+    /* registration data follows */
 };
 typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;

+static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
+                                                 uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
+                                                 uint64_t hdr_dst_size, void *local_handle,
+                                                 size_t local_handle_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RDMA_HDR_FILL(h) \
-do {                                 \
-    (h).hdr_padding[0] = 0;          \
-    (h).hdr_padding[1] = 0;          \
-} while(0)
-#else
-#define MCA_PML_OB1_RDMA_HDR_FILL(h)
-#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+    hdr->hdr_req.lval = hdr_req;
+    hdr->hdr_frag.pval = hdr_frag;
+    hdr->hdr_recv_req.pval = hdr_recv_req;
+    hdr->hdr_rdma_offset = hdr_rdma_offset;
+    hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
+    hdr->hdr_dst_size = hdr_dst_size;

-#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
-    do { \
-        MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
-        (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
+    /* copy registration handle */
+    memcpy (hdr + 1, local_handle, local_handle_size);
+}
+
+#define MCA_PML_OB1_RDMA_HDR_NTOH(h)                        \
+    do {                                                    \
+        MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common);        \
         (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset);  \
+        (h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr);          \
+        (h).hdr_dst_size = ntoh64((h).hdr_dst_size);        \
     } while (0)

-#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
-    do { \
-        MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-        MCA_PML_OB1_RDMA_HDR_FILL(h); \
-        (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
+#define MCA_PML_OB1_RDMA_HDR_HTON(h)                        \
+    do {                                                    \
+        MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common);        \
         (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset);  \
+        (h).hdr_dst_ptr = hton64((h).hdr_dst_ptr);          \
+        (h).hdr_dst_size = hton64((h).hdr_dst_size);        \
     } while (0)

 /**
@@ -331,31 +378,34 @@ struct mca_pml_ob1_fin_hdr_t {
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[2];
 #endif
-    uint32_t hdr_fail;     /**< RDMA operation failed */
-    opal_ptr_t hdr_des;    /**< completed descriptor */
+    int64_t hdr_size;      /**< number of bytes completed (positive), error code (negative) */
+    opal_ptr_t hdr_frag;   /**< completed RDMA fragment */
 };
 typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;

+static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
+                                                uint64_t hdr_frag, int64_t hdr_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_FIN_HDR_FILL(h) \
-do {                                \
-    (h).hdr_padding[0] = 0;         \
-    (h).hdr_padding[1] = 0;         \
-} while (0)
-#else
-#define MCA_PML_OB1_FIN_HDR_FILL(h)
-#endif  /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+    hdr->hdr_frag.lval = hdr_frag;
+    hdr->hdr_size = hdr_size;
+}

-#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
-    do { \
+#define MCA_PML_OB1_FIN_HDR_NTOH(h)                     \
+    do {                                                \
         MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common);    \
+        (h).hdr_size = ntoh64((h).hdr_size);            \
     } while (0)

-#define MCA_PML_OB1_FIN_HDR_HTON(h) \
-    do { \
+#define MCA_PML_OB1_FIN_HDR_HTON(h)                     \
+    do {                                                \
         MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common);    \
-        MCA_PML_OB1_FIN_HDR_FILL(h); \
-    } while (0)
+        (h).hdr_size = hton64((h).hdr_size);            \
+    } while (0)

 /**
  * Union of defined hdr types.
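
The reworked RGET and PUT headers carry the BTL registration handle as an opaque, variable-size blob immediately past the fixed struct ("registration data follows"); the prepare functions memcpy it to hdr + 1 and the control message is allocated as sizeof(header) plus the btl's handle size. The self-contained sketch below demonstrates just that layout with stand-in types; none of the names are Open MPI API.

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct fixed_hdr { uint64_t src_ptr; uint64_t frag; };

    /* Pack a fixed header plus a trailing opaque handle blob. */
    static void *pack_ctl (const struct fixed_hdr *h, const void *handle,
                           size_t handle_size, size_t *total)
    {
        *total = sizeof (*h) + handle_size;      /* alloc covers both parts */
        struct fixed_hdr *msg = malloc (*total);
        *msg = *h;
        memcpy (msg + 1, handle, handle_size);   /* handle trails the header */
        return msg;
    }

    int main (void)
    {
        struct fixed_hdr h = { .src_ptr = 0x1000, .frag = 1 };
        uint8_t handle[16] = { 0 };              /* opaque btl handle bytes */
        size_t total;
        void *msg = pack_ctl (&h, handle, sizeof handle, &total);
        /* a receiver recovers the handle at (hdr + 1), as in the patch */
        free (msg);
        return 0;
    }
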
diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c
index 852cf5fad5..157cddd730 100644
--- a/ompi/mca/pml/ob1/pml_ob1_isend.c
+++ b/ompi/mca/pml/ob1/pml_ob1_isend.c
@@ -10,7 +10,7 @@
  *                         University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2014      Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2015      Research Organization for Information Science
@@ -68,7 +68,6 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
                                            ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
                                            ompi_communicator_t * comm)
 {
-    mca_btl_base_descriptor_t *des = NULL;
     mca_pml_ob1_match_hdr_t match;
     mca_bml_base_btl_t *bml_btl;
     opal_convertor_t convertor;
@@ -98,28 +97,21 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
         size = 0;
     }

-    match.hdr_common.hdr_flags = 0;
-    match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-    match.hdr_ctx = comm->c_contextid;
-    match.hdr_src = comm->c_my_rank;
-    match.hdr_tag = tag;
-    match.hdr_seq = seqn;
+    mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                   comm->c_contextid, comm->c_my_rank,
+                                   tag, seqn);

     ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);

     /* try to send immediately */
     rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
                              size, MCA_BTL_NO_ORDER,
                              MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
-                             MCA_PML_OB1_HDR_TYPE_MATCH, &des);
+                             MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
     if (count > 0) {
         opal_convertor_cleanup (&convertor);
     }

     if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        if (des) {
-            mca_bml_base_free (bml_btl, des);
-        }
-
         return rc;
     }

@@ -224,7 +216,7 @@ int mca_pml_ob1_send(void *buf,

     OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
     sendreq->req_send.req_base.req_proc = dst_proc;
-    sendreq->src_des = NULL;
+    sendreq->rdma_frag = NULL;

     MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                   buf,
diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c
index e1afda0689..c2c9bbbe89 100644
--- a/ompi/mca/pml/ob1/pml_ob1_rdma.c
+++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -9,6 +10,8 @@
  *                         University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
+ * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -27,11 +30,6 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdma.h"

-/* Use this registration if no registration needed for a BTL instead of NULL.
- * This will help other code to distinguish case when memory is not registered
- * from case when registration is not needed */
-static mca_mpool_base_registration_t pml_ob1_dummy_reg;
-
 /*
  * Check to see if memory is registered or can be registered. Build a
  * set of registrations on the request.
@@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls(
 {
     int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
     double weight_total = 0;
-    int num_btls_used = 0, n;
+    int num_btls_used = 0;

     /* shortcut when there are no rdma capable btls */
     if(num_btls == 0) {
@@ -53,29 +51,33 @@ size_t mca_pml_ob1_rdma_btls(
     }

     /* check to see if memory is registered */
-    for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
-            n++) {
+    for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
         mca_bml_base_btl_t* bml_btl =
             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
-                    (bml_endpoint->btl_rdma_index + n) % num_btls);
-        mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg;
-        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
+                                             (bml_endpoint->btl_rdma_index + n) % num_btls);
+        mca_btl_base_registration_handle_t *reg_handle = NULL;
+        mca_btl_base_module_t *btl = bml_btl->btl;

-        if( NULL != btl_mpool ) {
-            if(!mca_pml_ob1.leave_pinned) {
-                /* look through existing registrations */
-                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
-            } else {
-                /* register the memory */
-                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
+        if (btl->btl_register_mem) {
+            /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
+             * 2) the btl supports put, and 3) the fragment is larger than the minimum
+             * pipeline size specified by the BTL */
+            if (!mca_pml_ob1.leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) &&
+                size > btl->btl_min_rdma_pipeline_size) {
+                continue;
             }

-            if(NULL == reg)
+            /* try to register the memory region with the btl */
+            reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
+                                                size, MCA_BTL_REG_FLAG_REMOTE_READ);
+            if (NULL == reg_handle) {
+                /* btl requires registration but the registration failed */
                 continue;
-        }
+            }
+        } /* else no registration is needed with this btl */

         rdma_btls[num_btls_used].bml_btl = bml_btl;
-        rdma_btls[num_btls_used].btl_reg = reg;
+        rdma_btls[num_btls_used].btl_reg = reg_handle;
         weight_total += bml_btl->btl_weight;
         num_btls_used++;
     }
@@ -83,7 +85,7 @@ size_t mca_pml_ob1_rdma_btls(
     /* if we don't use leave_pinned and all BTLs that already have this memory
      * registered amount to less then half of available bandwidth - fall back to
      * pipeline protocol */
-    if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
+    if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
         return 0;

     mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
@@ -103,10 +105,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
     for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
         rdma_btls[i].bml_btl =
             mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
-        if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
-            rdma_btls[i].btl_reg = NULL;
-        else
-            rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
+        rdma_btls[i].btl_reg = NULL;

         weight_total += rdma_btls[i].bml_btl->btl_weight;
     }
diff --git a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c
index c814141a8e..cc13628eb4 100644
--- a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c
+++ b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.c
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -9,6 +10,8 @@
  *                         University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -21,9 +24,13 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdmafrag.h"

+static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag)
+{
+    frag->local_handle = NULL;
+}
+
 OBJ_CLASS_INSTANCE(
     mca_pml_ob1_rdma_frag_t,
     ompi_free_list_item_t,
-    NULL,
+    mca_pml_ob1_rdma_frag_constructor,
     NULL);
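
With the dummy registration gone, a NULL handle now simply means "this btl needs no registration" (btl_register_mem is NULL), while a btl that does export btl_register_mem must return a non-NULL handle or be skipped. A minimal sketch of that decision, assuming the patched Open MPI headers; the helper name is hypothetical:

    /* Sketch only: compiles against the patched ompi/mca/bml headers. */
    static int try_rdma_register (mca_bml_base_btl_t *bml_btl, void *base, size_t size,
                                  mca_btl_base_registration_handle_t **handle_out)
    {
        *handle_out = NULL;

        if (NULL == bml_btl->btl->btl_register_mem) {
            /* btl can RDMA unregistered memory; a NULL handle is valid */
            return OMPI_SUCCESS;
        }

        mca_bml_base_register_mem (bml_btl, base, size,
                                   MCA_BTL_REG_FLAG_REMOTE_READ, handle_out);
        if (NULL == *handle_out) {
            /* registration required but failed */
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        return OMPI_SUCCESS;
    }

Every non-NULL handle obtained this way must later be released with mca_bml_base_deregister_mem, which is exactly what the reworked MCA_PML_OB1_RDMA_FRAG_RETURN macro (next file) does for frag->local_handle.
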
diff --git a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h
index 287daed022..132c962833 100644
--- a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h
+++ b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h
@@ -10,6 +10,8 @@
  *                         University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -32,38 +34,52 @@ typedef enum {
     MCA_PML_OB1_RDMA_GET
 } mca_pml_ob1_rdma_state_t;

+struct mca_pml_ob1_rdma_frag_t;
+
+typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
+
+/**
+ * Used to keep track of local and remote RDMA operations.
+ */
 struct mca_pml_ob1_rdma_frag_t {
     ompi_free_list_item_t super;
-    mca_bml_base_btl_t* rdma_bml;
+    mca_bml_base_btl_t *rdma_bml;
     mca_pml_ob1_hdr_t rdma_hdr;
     mca_pml_ob1_rdma_state_t rdma_state;
     size_t rdma_length;
-    uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
     void *rdma_req;
-    struct mca_bml_base_endpoint_t* rdma_ep;
-    opal_convertor_t convertor;
-    mca_mpool_base_registration_t* reg;
     uint32_t retries;
+    mca_pml_ob1_rdma_frag_callback_t cbfunc;
+
+    uint64_t rdma_offset;
+    void *local_address;
+    mca_btl_base_registration_handle_t *local_handle;
+
+    uint64_t remote_address;
+    uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
 };
 typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;

 OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t);

-#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag)                       \
-do {                                                            \
-    ompi_free_list_item_t* item;                                \
+#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag)                       \
+    do {                                                        \
+        ompi_free_list_item_t* item;                            \
         OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item);  \
-    frag = (mca_pml_ob1_rdma_frag_t*)item;                      \
-} while(0)
-
-#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag)                      \
-do {                                                            \
-    /* return fragment */                                       \
-    OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags,           \
-        (ompi_free_list_item_t*)frag);                          \
+        frag = (mca_pml_ob1_rdma_frag_t*)item;                  \
 } while(0)

+#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag)                      \
+    do {                                                        \
+        /* return fragment */                                   \
+        if (frag->local_handle) {                               \
+            mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \
+            frag->local_handle = NULL;                          \
+        }                                                       \
+        OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags,       \
+                                 (ompi_free_list_item_t*)frag); \
+    } while (0)

 END_C_DECLS
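
With descriptors removed from the completion path, the rdma frag itself now carries the callback: the initiator fills frag->cbfunc before advertising the frag pointer in a PUT/RGET header, and the peer echoes that pointer back in a FIN, which dispatches cbfunc with the signed completion size. A condensed pseudocode sketch of that lifecycle, assuming the patched types (mca_pml_ob1_put_completion is file-local in the real code, used here only for illustration):

    /* Sketch only: the receive-side life of an rdma frag. */
    static void example_frag_lifecycle (mca_pml_ob1_recv_request_t *recvreq,
                                        mca_bml_base_btl_t *bml_btl,
                                        void *data_ptr, size_t size)
    {
        mca_pml_ob1_rdma_frag_t *frag;

        MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);     /* constructor NULLed local_handle */

        frag->rdma_bml      = bml_btl;
        frag->rdma_req      = recvreq;
        frag->local_address = data_ptr;
        frag->rdma_length   = size;
        frag->cbfunc        = mca_pml_ob1_put_completion;  /* runs on matching FIN */

        /* ... a pointer to frag travels in a PUT header; the peer echoes it
         * in a FIN and recvfrag dispatches frag->cbfunc(frag, hdr_size) ... */

        MCA_PML_OB1_RDMA_FRAG_RETURN(frag);    /* also deregisters local_handle */
    }
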
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
index 337496dc7b..e37e1e7144 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
@@ -13,7 +13,7 @@
  * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2006-2008 University of Houston. All rights reserved.
  * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
                                           mca_btl_base_descriptor_t* des,
                                           void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
     ompi_communicator_t *comm_ptr;
     mca_pml_ob1_recv_request_t *match = NULL;
     mca_pml_ob1_comm_t *comm;
     mca_pml_ob1_comm_proc_t *proc;
-    size_t num_segments = des->des_local_count;
+    size_t num_segments = des->des_segment_count;
     size_t bytes_received = 0;

     assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
@@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;

     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
     mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
-                                des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV);
+                                des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
     return;
 }

@@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;

     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@@ -281,7 +281,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
     mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
-                                des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET);
+                                des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
     return;
 }

@@ -292,9 +292,10 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     mca_pml_ob1_send_request_t* sendreq;
+    size_t size;

     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
         return;
@@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
     /* if the request should be delivered entirely by copy in/out
      * then throttle sends */
     if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
-        if (NULL != sendreq->src_des) {
-            /* release registered memory */
-            mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
-            sendreq->src_des = NULL;
+        if (NULL != sendreq->rdma_frag) {
+            if (NULL != sendreq->rdma_frag->local_handle) {
+                mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
+                sendreq->rdma_frag->local_handle = NULL;
+            }
+            MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
+            sendreq->rdma_frag = NULL;
         }

         sendreq->req_throttle_sends = true;
     }
-
-    mca_pml_ob1_send_request_copy_in_out(sendreq,
-                                         hdr->hdr_ack.hdr_send_offset,
-                                         sendreq->req_send.req_bytes_packed -
-                                         hdr->hdr_ack.hdr_send_offset);
+
+    if (hdr->hdr_ack.hdr_send_size) {
+        size = hdr->hdr_ack.hdr_send_size;
+    } else {
+        size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
+    }
+
+    mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);

     if (sendreq->req_state != 0) {
         /* Typical receipt of an ACK message causes req_state to be
@@ -355,13 +362,14 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata ) {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     mca_pml_ob1_recv_request_t* recvreq;

     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
         return;
     }
+
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
     recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
@@ -372,7 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
         assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
         /* This will trigger the opal_convertor_pack to start asynchronous copy. */
-        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des);
+        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);

         /* Let BTL know that it CANNOT free the frag */
         des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
@@ -380,7 +388,8 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
         return;
     }
 #endif /* OPAL_CUDA_SUPPORT */
-    mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
+
+    mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);

     return;
 }

@@ -390,7 +399,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     mca_pml_ob1_send_request_t* sendreq;

@@ -410,20 +419,17 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
-    mca_btl_base_segment_t* segments = des->des_local;
-    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
-    mca_btl_base_descriptor_t* rdma;
+    mca_btl_base_segment_t* segments = des->des_segments;
+    mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
+    mca_pml_ob1_rdma_frag_t *frag;

-    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
+    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
         return;
     }
-
+
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
-    rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
-    rdma->des_cbfunc(btl, NULL, rdma,
-                     hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
-
-    return;
+    frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
+    frag->cbfunc (frag, hdr->hdr_size);
 }

@@ -699,7 +705,7 @@ out_of_order_match:
     OPAL_THREAD_UNLOCK(&comm->matching_lock);

     if(OPAL_LIKELY(match)) {
-        switch(type) {
+        switch(type) {
         case MCA_PML_OB1_HDR_TYPE_MATCH:
             mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
             break;
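
The new hdr_send_size field on the ACK path makes partial send fallback possible: 0 keeps the old behavior (copy in/out everything past hdr_send_offset), while a non-zero value asks the sender to fall back to send for just that region, e.g. after a failed get. A minimal self-contained sketch of the size computation used in mca_pml_ob1_recv_frag_callback_ack above:

    #include <stdint.h>

    /* Mirrors the ACK-size logic in the patch: 0 means "all remaining". */
    static uint64_t ack_bytes_to_send (uint64_t bytes_packed,
                                       uint64_t send_offset, uint64_t send_size)
    {
        return send_size ? send_size : bytes_packed - send_offset;
    }
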
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2012 FUJITSU LIMITED. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science @@ -150,12 +150,17 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free; request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel; request->req_rdma_cnt = 0; + request->local_handle = NULL; OBJ_CONSTRUCT(&request->lock, opal_mutex_t); } static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request) { OBJ_DESTRUCT(&request->lock); + if (OPAL_UNLIKELY(request->local_handle)) { + mca_bml_base_deregister_mem (request->rdma_bml, request->local_handle); + request->local_handle = NULL; + } } OBJ_CLASS_INSTANCE( @@ -183,31 +188,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl, * Put operation has completed remotely - update request status */ -static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status ) +static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size) { - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata; - size_t bytes_received = 0; + mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; - if( OPAL_LIKELY(status == OMPI_SUCCESS) ) { - bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, 0); - } OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); - mca_bml_base_free(bml_btl, des); + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - /* check completion status */ - OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); - if(recv_request_pml_complete_check(recvreq) == false && + if (OPAL_LIKELY(0 < rdma_size)) { + assert (rdma_size == frag->rdma_length); + + /* check completion status */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size); + if (recv_request_pml_complete_check(recvreq) == false && recvreq->req_rdma_offset < recvreq->req_send_offset) { - /* schedule additional rdma operations */ - mca_pml_ob1_recv_request_schedule(recvreq, bml_btl); + /* schedule additional rdma operations */ + mca_pml_ob1_recv_request_schedule(recvreq, bml_btl); + } } + MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } @@ -218,7 +219,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, int mca_pml_ob1_recv_request_ack_send_btl( ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, - bool nordma) + uint64_t size, bool nordma) { mca_btl_base_descriptor_t* des; mca_pml_ob1_ack_hdr_t* ack; @@ -234,12 +235,9 @@ int mca_pml_ob1_recv_request_ack_send_btl( } /* fill out header */ - ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval; - ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK; - ack->hdr_common.hdr_flags = nordma ? 
MCA_PML_OB1_HDR_FLAGS_NORDMA : 0; - ack->hdr_src_req.lval = hdr_src_req; - ack->hdr_dst_req.pval = hdr_dst_req; - ack->hdr_send_offset = hdr_send_offset; + ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval; + mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0, + hdr_src_req, hdr_dst_req, hdr_send_offset, size); ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc); @@ -313,63 +311,99 @@ static int mca_pml_ob1_recv_request_ack( if(recvreq->req_send_offset == hdr->hdr_msg_length) return OMPI_SUCCESS; } + /* let know to shedule function there is no need to put ACK flag */ recvreq->req_ack_sent = true; return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval, - recvreq, recvreq->req_send_offset, + recvreq, recvreq->req_send_offset, 0, recvreq->req_send_offset == bytes_received); } +static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag); + +static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc) +{ + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; + ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc; + + if (OMPI_ERR_NOT_AVAILABLE == rc) { + /* get isn't supported for this transfer. tell peer to fallback on put */ + rc = mca_pml_ob1_recv_request_put_frag (frag); + if (OMPI_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&mca_pml_ob1.lock); + opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); + + return OMPI_SUCCESS; + } + } + + if (++frag->retries < mca_pml_ob1.rdma_retries_limit && + OMPI_ERR_OUT_OF_RESOURCE == rc) { + OPAL_THREAD_LOCK(&mca_pml_ob1.lock); + opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); + + return OMPI_SUCCESS; + } + + /* tell peer to fall back on send for this region */ + rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval, + recvreq, frag->rdma_offset, frag->rdma_length, false); + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); + return rc; +} + /** * Return resources used by the RDMA */ -static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status ) +static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) { - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; - mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req; + mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context; + mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata; + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; /* check completion status */ - if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { - /* TSW - FIX */ - OMPI_ERROR_LOG(status); - ompi_rte_abort(-1, NULL); + if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + status = mca_pml_ob1_recv_request_get_frag_failed (frag, status); + if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + /* TSW - FIX */ + OMPI_ERROR_LOG(status); + ompi_rte_abort(-1, NULL); + } + } else { + /* is receive request complete */ + OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); + /* TODO: re-add order */ + mca_pml_ob1_send_fin 
(recvreq->req_recv.req_base.req_proc, + bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag, + frag->rdma_length, 0, 0); + + recv_request_pml_complete_check(recvreq); + + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); } - /* is receive request complete */ - OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); - if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) { - mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc, - bml_btl, - frag->rdma_hdr.hdr_rget.hdr_des, - des->order, 0); - } - - recv_request_pml_complete_check(recvreq); - - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } -static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag, - mca_btl_base_descriptor_t *dst) { +static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag) +{ mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req; mca_bml_base_btl_t *bml_btl = frag->rdma_bml; mca_btl_base_descriptor_t *ctl; mca_pml_ob1_rdma_hdr_t *hdr; - size_t seg_size; + size_t reg_size; int rc; - seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count; + reg_size = bml_btl->btl->btl_registration_handle_size; /* prepare a descriptor for rdma control message */ - mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size, + mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL); if (OPAL_UNLIKELY(NULL == ctl)) { @@ -378,26 +412,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag, ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion; /* fill in rdma header */ - hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT; - hdr->hdr_common.hdr_flags = - (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0; + hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval; + mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? 
MCA_PML_OB1_HDR_TYPE_ACK : 0, + recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset, + frag->local_address, frag->rdma_length, frag->local_handle, + reg_size); - hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req; - hdr->hdr_rdma_offset = recvreq->req_rdma_offset; - hdr->hdr_des.pval = dst; - hdr->hdr_recv_req.pval = recvreq; + frag->cbfunc = mca_pml_ob1_put_completion; - hdr->hdr_seg_cnt = dst->des_local_count; + recvreq->req_ack_sent = true; - /* copy segments */ - memcpy (hdr + 1, dst->des_local, seg_size); - - dst->des_cbfunc = mca_pml_ob1_put_completion; - dst->des_cbdata = recvreq; - - if (!recvreq->req_ack_sent) - recvreq->req_ack_sent = true; + PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, + &(recvreq->req_recv.req_base), size, + PERUSE_RECV); /* send rdma request to peer */ rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT); @@ -412,71 +439,30 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag, /* * */ -int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag ) +int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag) { - mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req; - mca_bml_base_btl_t* bml_btl = frag->rdma_bml; - mca_btl_base_descriptor_t* descriptor; - size_t save_size = frag->rdma_length; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; int rc; /* prepare descriptor */ - mca_bml_base_prepare_dst( bml_btl, - NULL, - &recvreq->req_recv.req_base.req_convertor, - MCA_BTL_NO_ORDER, - 0, - &frag->rdma_length, - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK | - MCA_BTL_DES_FLAGS_GET, - &descriptor ); - if( OPAL_UNLIKELY(NULL == descriptor) ) { - if (frag->retries < mca_pml_ob1.rdma_retries_limit) { - frag->rdma_length = save_size; - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - return OMPI_ERR_OUT_OF_RESOURCE; - } else { - ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc; - - /* tell peer to fall back on send */ - recvreq->req_send_offset = 0; - rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval, - recvreq, recvreq->req_send_offset, true); - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); - return rc; + if (bml_btl->btl->btl_register_mem && !frag->local_handle) { + mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE | + MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle); + if (OPAL_UNLIKELY(NULL == frag->local_handle)) { + return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE); } } - descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs; - descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; - descriptor->des_cbfunc = mca_pml_ob1_rget_completion; - descriptor->des_cbdata = frag; - PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE, - &(recvreq->req_recv.req_base), + &(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base), frag->rdma_length, PERUSE_RECV); /* queue up get request */ - rc = mca_bml_base_get(bml_btl,descriptor); + rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, frag->local_handle, + (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length, + 0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == 
rc)) { - /* get isn't supported for this transfer. tell peer to fallback on put */ - rc = mca_pml_ob1_init_get_fallback (frag, descriptor); - } - - if(OMPI_ERR_OUT_OF_RESOURCE == rc) { - mca_bml_base_free(bml_btl, descriptor); - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, - (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - return OMPI_ERR_OUT_OF_RESOURCE; - } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - OMPI_ERROR_LOG(rc); - ompi_rte_abort(-1, NULL); - } + return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE); } return OMPI_SUCCESS; @@ -502,6 +488,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments, sizeof(mca_pml_ob1_frag_hdr_t)); data_offset = hdr->hdr_frag.hdr_frag_offset; + /* * Make user buffer accessible(defined) before unpacking. */ @@ -573,7 +560,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr /* Store the receive request in unused context pointer. */ des->des_context = (void *)recvreq; /* Store the amount of bytes in unused remote count value */ - des->des_remote_count = bytes_delivered; + des->des_segment_count = bytes_delivered; /* Then record an event that will get triggered by a PML progress call which * checks the stream events. If we get an error, abort. Should get message * from CUDA code about what went wrong. */ @@ -598,7 +585,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl, int status ) { mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context; - size_t bytes_received = des->des_remote_count; + size_t bytes_received = des->des_segment_count; OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des)); /* Call into the BTL so it can free the descriptor. At this point, it is @@ -629,7 +616,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval; mca_bml_base_endpoint_t* bml_endpoint = NULL; size_t bytes_remaining, prev_sent, offset; - mca_btl_base_segment_t *r_segments; mca_pml_ob1_rdma_frag_t *frag; mca_bml_base_btl_t *rdma_bml; int rc; @@ -637,6 +623,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq prev_sent = offset = 0; bytes_remaining = hdr->hdr_rndv.hdr_msg_length; recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; + recvreq->req_send_offset = 0; MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); @@ -680,8 +667,10 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq ompi_rte_abort(-1, NULL); } - bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1), - hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc); + bytes_remaining = hdr->hdr_rndv.hdr_msg_length; + + /* save the request for put fallback */ + recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; /* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num * of bytes left to be send. In each iteration we send the max possible bytes supported @@ -690,7 +679,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq * the next iteration with the updated size. 
* Also - In each iteration we update the location in the buffer to be used for writing * the message ,and the location to read from. This is done using the offset variable that - * accumulates the number of bytes that were sent so far. */ + * accumulates the number of bytes that were sent so far. + * + * NTH: This fragmentation may go away if we change the btls to require them to handle + * get fragmentation internally. This is a reasonable solution since some btls do not + * need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up + * being the case. */ while (bytes_remaining > 0) { /* allocate/initialize a fragment */ MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); @@ -700,29 +694,31 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq ompi_rte_abort(-1, NULL); } - assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs)); + memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size); - memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt); - - /* update the read location -- NTH: note this will only work if there is exactly one - segment. TODO -- make this work with multiple segments */ - r_segments = (mca_btl_base_segment_t *) frag->rdma_segs; - r_segments->seg_addr.lval += offset; + /* update the read location */ + frag->remote_address = hdr->hdr_src_ptr + offset; /* updating the write location */ OPAL_THREAD_LOCK(&recvreq->lock); opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset); + opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address); OPAL_THREAD_UNLOCK(&recvreq->lock); frag->rdma_bml = rdma_bml; frag->rdma_hdr.hdr_rget = *hdr; - frag->retries = 0; - frag->rdma_req = recvreq; - frag->rdma_ep = bml_endpoint; - frag->rdma_state = MCA_PML_OB1_RDMA_GET; - frag->reg = NULL; - frag->rdma_length = bytes_remaining; + frag->retries = 0; + frag->rdma_req = recvreq; + frag->rdma_state = MCA_PML_OB1_RDMA_GET; + frag->local_handle = NULL; + frag->rdma_offset = offset; + + if (bytes_remaining > rdma_bml->btl->btl_get_limit) { + frag->rdma_length = rdma_bml->btl->btl_get_limit; + } else { + frag->rdma_length = bytes_remaining; + } /* NTH: TODO -- handle error conditions gracefully */ rc = mca_pml_ob1_recv_request_get_frag(frag); @@ -921,13 +917,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq, while(bytes_remaining > 0 && recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) { - size_t size, seg_size; - mca_pml_ob1_rdma_hdr_t* hdr; - mca_btl_base_descriptor_t* dst; - mca_btl_base_descriptor_t* ctl; - mca_mpool_base_registration_t * reg = NULL; - mca_btl_base_module_t* btl; + mca_pml_ob1_rdma_frag_t *frag = NULL; + mca_btl_base_module_t *btl; int rc, rdma_idx; + void *data_ptr; + size_t size; if(prev_bytes_remaining == bytes_remaining) { if(++num_fail == num_tries) { @@ -948,86 +942,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq, do { rdma_idx = recvreq->req_rdma_idx; bml_btl = recvreq->req_rdma[rdma_idx].bml_btl; - reg = recvreq->req_rdma[rdma_idx].btl_reg; size = recvreq->req_rdma[rdma_idx].length; if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt) recvreq->req_rdma_idx = 0; } while(!size); btl = bml_btl->btl; - /* makes sure that we don't exceed BTL max rdma size - * if memory is not pinned already */ - if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) && - (size > btl->btl_rdma_pipeline_frag_size)) { + /* NTH: This conditional used to 
check if there was a registration in + * recvreq->req_rdma[rdma_idx].btl_reg. If one existed it was due to + * the btl not needing registration (equivalent to btl->btl_register_mem + * != NULL). This new check is equivalent. Note: I feel this protocol + * needs work to improve resource usage when running with a + * leave pinned protocol. */ + if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) && + (size > btl->btl_rdma_pipeline_frag_size)) { size = btl->btl_rdma_pipeline_frag_size; } - /* take lock to protect converter against concurrent access + MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); + if (OPAL_UNLIKELY(NULL == frag)) { + continue; + } + + /* take lock to protect convertor against concurrent access * from unpack */ OPAL_THREAD_LOCK(&recvreq->lock); - opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, - &recvreq->req_rdma_offset ); - - /* prepare a descriptor for RDMA */ - mca_bml_base_prepare_dst(bml_btl, reg, - &recvreq->req_recv.req_base.req_convertor, - MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_FLAGS_PUT, &dst); + opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor, + &recvreq->req_rdma_offset); + opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr); OPAL_THREAD_UNLOCK(&recvreq->lock); - if(OPAL_UNLIKELY(dst == NULL)) { - continue; + if (btl->btl_register_mem) { + mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE, + &frag->local_handle); + if (OPAL_UNLIKELY(NULL == frag->local_handle)) { + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); + continue; + } } - dst->des_cbfunc = mca_pml_ob1_put_completion; - dst->des_cbdata = recvreq; + /* fill in the minimum information needed to handle the fin message */ + frag->cbfunc = mca_pml_ob1_put_completion; + frag->rdma_length = size; + frag->rdma_req = recvreq; + frag->rdma_bml = bml_btl; + frag->local_address = data_ptr; + frag->rdma_offset = recvreq->req_rdma_offset; - seg_size = btl->btl_seg_size * dst->des_local_count; - - /* prepare a descriptor for rdma control message */ - mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL); - - if( OPAL_UNLIKELY(NULL == ctl) ) { - mca_bml_base_free(bml_btl,dst); - continue; - } - ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion; - - /* fill in rdma header */ - hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT; - hdr->hdr_common.hdr_flags = - (!recvreq->req_ack_sent) ? 
MCA_PML_OB1_HDR_TYPE_ACK : 0; - hdr->hdr_req = recvreq->remote_req_send; - hdr->hdr_des.pval = dst; - hdr->hdr_recv_req.pval = recvreq; - hdr->hdr_rdma_offset = recvreq->req_rdma_offset; - hdr->hdr_seg_cnt = dst->des_local_count; - - /* copy segments */ - memmove (hdr + 1, dst->des_local, seg_size); - - if(!recvreq->req_ack_sent) - recvreq->req_ack_sent = true; - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc); - - PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, - &(recvreq->req_recv.req_base), size, - PERUSE_RECV); - - /* send rdma request to peer */ - rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT); - if( OPAL_LIKELY( rc >= 0 ) ) { + rc = mca_pml_ob1_recv_request_put_frag (frag); + if (OPAL_LIKELY(OMPI_SUCCESS == rc)) { /* update request state */ recvreq->req_rdma_offset += size; OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); recvreq->req_rdma[rdma_idx].length -= size; bytes_remaining -= size; } else { - mca_bml_base_free(bml_btl,ctl); - mca_bml_base_free(bml_btl,dst); + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); } } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index e49d5b6013..79a22b752b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
@@ -131,7 +132,7 @@ do { \ #define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \ { \ MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \ - OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \ + OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \ (ompi_free_list_item_t*)(recvreq)); \ } @@ -154,9 +155,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq) } for(i = 0; i < recvreq->req_rdma_cnt; i++) { - mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg; - if( NULL != btl_reg && btl_reg->mpool != NULL) { - btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); + struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg; + mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl; + + if (NULL != handle) { + mca_bml_base_deregister_mem (bml_btl, handle); } } recvreq->req_rdma_cnt = 0; @@ -178,6 +181,10 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq) recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR = MPI_ERR_TRUNCATE; } + if (OPAL_UNLIKELY(recvreq->local_handle)) { + mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle); + recvreq->local_handle = NULL; + } MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq); } OPAL_THREAD_UNLOCK(&ompi_request_lock); @@ -387,7 +394,7 @@ static inline void mca_pml_ob1_recv_request_schedule( (void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl); } -#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \ +#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \ do { \ mca_pml_ob1_pckt_pending_t *_pckt; \ \ @@ -396,6 +403,7 @@ static inline void mca_pml_ob1_recv_request_schedule( _pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \ _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \ _pckt->hdr.hdr_ack.hdr_send_offset = (O); \ + _pckt->hdr.hdr_ack.hdr_send_size = (Sz); \ _pckt->proc = (P); \ _pckt->bml_btl = NULL; \ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \ @@ -406,11 +414,11 @@ static inline void mca_pml_ob1_recv_request_schedule( int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, - uint64_t hdr_rdma_offset, bool nordma); + uint64_t hdr_rdma_offset, uint64_t size, bool nordma); static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, - bool nordma) + uint64_t size, bool nordma) { size_t i; mca_bml_base_btl_t* bml_btl; @@ -420,12 +428,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, - hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS) + hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS) return OMPI_SUCCESS; } MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req, - hdr_send_offset); + hdr_send_offset, size); return OMPI_ERR_OUT_OF_RESOURCE; } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 86d7dc0dce..b9c6dedc1b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -13,7 +13,7 @@ * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. 
All rights + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -137,6 +137,7 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req) req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel; req->req_rdma_cnt = 0; req->req_throttle_sends = false; + req->rdma_frag = NULL; OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t); OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t); } @@ -145,6 +146,10 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req) { OBJ_DESTRUCT(&req->req_send_ranges); OBJ_DESTRUCT(&req->req_send_range_lock); + if (req->rdma_frag) { + MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag); + req->rdma_frag = NULL; + } } OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t, @@ -236,10 +241,9 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, * happens in one thread, the increase of the req_bytes_delivered does not * have to be atomic. */ - req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, - sizeof(mca_pml_ob1_rendezvous_hdr_t)); + req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments, + des->des_segment_count, + sizeof(mca_pml_ob1_rendezvous_hdr_t)); mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered ); } @@ -250,27 +254,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, */ static void -mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status ) +mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length) { - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; - size_t req_bytes_delivered; + mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; /* count bytes of user data actually delivered and check for request completion */ - if (OPAL_LIKELY(OMPI_SUCCESS == status)) { - req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, 0); - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); + if (OPAL_LIKELY(0 < rdma_length)) { + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length); } - sendreq->src_des = NULL; send_request_pml_complete_check(sendreq); - /* free the descriptor */ - mca_bml_base_free(bml_btl, des); + MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } @@ -314,10 +309,9 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl, } /* count bytes of user data actually delivered */ - req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size, - (void *) des->des_local, - des->des_local_count, - sizeof(mca_pml_ob1_frag_hdr_t)); + req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments, + des->des_segment_count, + sizeof(mca_pml_ob1_frag_hdr_t)); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); @@ -389,7 +383,7 @@ int mca_pml_ob1_send_request_start_buffered( if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; /* pack the data into the BTL supplied buffer */ iov.iov_base = (IOVBASE_TYPE*)((unsigned 
char*)segment->seg_addr.pval + @@ -408,17 +402,14 @@ int mca_pml_ob1_send_request_start_buffered( /* build rendezvous header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_rndv.hdr_src_req.pval = sendreq; + mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_bytes_packed, sendreq); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc); /* update lengths */ segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data; @@ -491,15 +482,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, if(NULL != bml_btl->btl->btl_sendi) { mca_pml_ob1_match_hdr_t match; - match.hdr_common.hdr_flags = 0; - match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - match.hdr_tag = sendreq->req_send.req_base.req_tag; - match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence); - ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc); /* try to send immediately */ rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor, @@ -532,7 +521,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; if(size > 0) { /* pack the data into the supplied buffer */ @@ -566,15 +555,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, /* build match header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, 
sendreq->req_send.req_base.req_proc); /* update lengths */ segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data; @@ -618,7 +605,6 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, /* prepare descriptor */ mca_bml_base_prepare_src( bml_btl, - NULL, &sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, OMPI_PML_OB1_MATCH_HDR_LEN, @@ -628,19 +614,17 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; /* build match header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc); /* short message */ des->des_cbfunc = mca_pml_ob1_match_completion_free; @@ -674,80 +658,68 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, * one RDMA capable BTLs). This way round robin distribution of RDMA * operation is achieved. */ - - mca_btl_base_descriptor_t *des, *src = NULL; + mca_btl_base_registration_handle_t *local_handle; + mca_btl_base_descriptor_t *des; + mca_pml_ob1_rdma_frag_t *frag; mca_pml_ob1_rget_hdr_t *hdr; - size_t seg_size; + size_t reg_size; + void *data_ptr; int rc; - sendreq->src_des = NULL; - bml_btl = sendreq->req_rdma[0].bml_btl; if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) { + sendreq->rdma_frag = NULL; /* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */ return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN); } - MEMCHECKER( - memchecker_call(&opal_memchecker_base_mem_defined, - sendreq->req_send.req_base.req_addr, - sendreq->req_send.req_base.req_count, - sendreq->req_send.req_base.req_datatype); - ); - /* prepare source descriptor/segment(s) */ - /* PML owns this descriptor and will free it in */ - /* mca_pml_ob1_rget_completion */ - mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET | - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src ); - MEMCHECKER( - memchecker_call(&opal_memchecker_base_mem_noaccess, - sendreq->req_send.req_base.req_addr, - sendreq->req_send.req_base.req_count, - sendreq->req_send.req_base.req_datatype); - ); - if( OPAL_UNLIKELY(NULL == src) ) { - return OMPI_ERR_OUT_OF_RESOURCE; + /* at this time ob1 does not support non-contiguous gets. 
the convertor represents a + * contiguous block of memory */ + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + + local_handle = sendreq->req_rdma[0].btl_reg; + + /* allocate an rdma fragment to keep track of the request size for use in the fin message */ + MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); + if (OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; } - - src->des_cbfunc = mca_pml_ob1_rget_completion; - src->des_cbdata = sendreq; - sendreq->src_des = src; + /* fill in necessary fragment data */ + frag->rdma_req = sendreq; + frag->rdma_bml = bml_btl; + frag->rdma_length = size; + frag->cbfunc = mca_pml_ob1_rget_completion; + /* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */ - seg_size = bml_btl->btl->btl_seg_size * src->des_local_count; + /* save the fragment for get->put fallback */ + sendreq->rdma_frag = frag; + + reg_size = bml_btl->btl->btl_registration_handle_size; /* allocate space for get hdr + segment list */ - mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size, + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL); if( OPAL_UNLIKELY(NULL == des) ) { /* NTH: no need to reset the converter here. it will be reset before it is retried */ - mca_bml_base_free(bml_btl, src); return OMPI_ERR_OUT_OF_RESOURCE; } /* build match header */ - hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval; - - hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN; - hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET; - hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_rndv.hdr_src_req.pval = sendreq; - hdr->hdr_des.pval = src; - hdr->hdr_seg_cnt = src->des_local_count; + hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval; + /* TODO -- Add support for multiple segments for get */ + mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_bytes_packed, sendreq, + frag, data_ptr, local_handle, reg_size); ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc); - /* copy segment data */ - memcpy (hdr + 1, src->des_local, seg_size); - des->des_cbfunc = mca_pml_ob1_send_ctl_completion; des->des_cbdata = sendreq; @@ -765,12 +737,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET); if (OPAL_UNLIKELY(rc < 0)) { mca_bml_base_free(bml_btl, des); - - if (sendreq->src_des) { - mca_bml_base_free (bml_btl, sendreq->src_des); - sendreq->src_des = NULL; - } - return rc; } @@ -808,7 +774,6 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_base.req_datatype); ); mca_bml_base_prepare_src( bml_btl, - NULL, 
&sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rendezvous_hdr_t), @@ -827,21 +792,19 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = des->des_local; + segment = des->des_segments; /* build hdr */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = flags | MCA_PML_OB1_HDR_FLAGS_SIGNAL; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; - hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_rndv.hdr_src_req.pval = sendreq; + mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags | + MCA_PML_OB1_HDR_FLAGS_SIGNAL, + sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank, + sendreq->req_send.req_base.req_tag, + (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_send.req_bytes_packed, sendreq); - ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, - sendreq->req_send.req_base.req_proc); + ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc); /* first fragment of a long message */ des->des_cbdata = sendreq; @@ -1022,13 +985,10 @@ cannot_pack: sendreq->req_send.req_base.req_count, sendreq->req_send.req_base.req_datatype); ); - mca_bml_base_prepare_src(bml_btl, NULL, - &sendreq->req_send.req_base.req_convertor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_ob1_frag_hdr_t), + mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor, + MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t), &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK | - MCA_BTL_DES_FLAGS_SIGNAL, - &des); + MCA_BTL_DES_FLAGS_SIGNAL, &des); MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_noaccess, sendreq->req_send.req_base.req_addr, @@ -1051,12 +1011,9 @@ cannot_pack: des->des_cbdata = sendreq; /* setup header */ - hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval; - hdr->hdr_common.hdr_flags = 0; - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG; - hdr->hdr_frag_offset = range->range_send_offset; - hdr->hdr_src_req.pval = sendreq; - hdr->hdr_dst_req = sendreq->req_recv; + hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval; + mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq, + sendreq->req_recv.lval); ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG, sendreq->req_send.req_base.req_proc); @@ -1113,38 +1070,66 @@ cannot_pack: } +/** + * A put fragment could not be started. Queue the fragment to be retried later or + * fall back on send/recv. 
+ */ +static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc) +{ + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = frag->rdma_bml; + + if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) { + /* queue the frag for later if there was a resource error */ + OPAL_THREAD_LOCK(&mca_pml_ob1.lock); + opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); + } else { + /* tell receiver to deregister memory */ + mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER, + OPAL_ERR_TEMP_OUT_OF_RESOURCE); + + /* send fragment by copy in/out */ + mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, + frag->rdma_length); + /* if a pointer to a receive request is not set it means that + * ACK was not yet received. Don't schedule sends before ACK */ + if (NULL != sendreq->req_recv.pval) + mca_pml_ob1_send_request_schedule (sendreq); + } +} + /** * An RDMA put operation has completed: * (1) Update request status and if required set completed - * (2) Send FIN control message to the destination + * (2) Send FIN control message to the destination */ -static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status ) +static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) { - mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata; + mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req; + mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context; /* check completion status */ - if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { - /* TSW - FIX */ - OMPI_ERROR_LOG(status); - ompi_rte_abort(-1, NULL); + if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) { + /* TODO -- re-add ordering */ + mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl, + frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length, + 0, 0); + + /* check for request completion */ + OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); + + send_request_pml_complete_check(sendreq); + } else { + /* try to fall back on send/recv */ + mca_pml_ob1_send_request_put_frag_failed (frag, status); } - mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc, - bml_btl, - frag->rdma_hdr.hdr_rdma.hdr_des, - des->order, 0); - - /* check for request completion */ - OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); - - send_request_pml_complete_check(sendreq); - MCA_PML_OB1_RDMA_FRAG_RETURN(frag); MCA_PML_OB1_PROGRESS_PENDING(bml_btl); @@ -1152,81 +1137,45 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag ) { - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req; - mca_mpool_base_registration_t *reg = NULL; + mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) 
frag->rdma_req; + mca_btl_base_registration_handle_t *local_handle = NULL; mca_bml_base_btl_t *bml_btl = frag->rdma_bml; - mca_btl_base_descriptor_t *des; - size_t save_size = frag->rdma_length; int rc; - if (OPAL_LIKELY(NULL == sendreq->src_des)) { - /* setup descriptor */ - mca_bml_base_prepare_src( bml_btl, - reg, - &frag->convertor, - MCA_BTL_NO_ORDER, - 0, - &frag->rdma_length, - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | - MCA_BTL_DES_FLAGS_PUT, - &des ); - - if( OPAL_UNLIKELY(NULL == des) ) { - if(frag->retries < mca_pml_ob1.rdma_retries_limit) { - size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset; - frag->rdma_length = save_size; - opal_convertor_set_position(&frag->convertor, &offset); - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - } else { - mca_pml_ob1_send_request_t *sendreq = - (mca_pml_ob1_send_request_t*)frag->rdma_req; + if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) { + /* Check if the segment is already registered */ + for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) { + if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) { + /* do not copy the handle to the fragment to avoid deregistering it twice */ + local_handle = sendreq->req_rdma[i].btl_reg; + break; + } + } - /* tell receiver to unregister memory */ - mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc, - bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, - MCA_BTL_NO_ORDER, 1); + if (NULL == frag->local_handle) { + /* Not already registered. Register the region with the BTL. */ + mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0, + &frag->local_handle); - /* send fragment by copy in/out */ - mca_pml_ob1_send_request_copy_in_out(sendreq, - frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length); - /* if a pointer to a receive request is not set it means that - * ACK was not yet received. 
Don't schedule sends before ACK */ - if(NULL != sendreq->req_recv.pval) - mca_pml_ob1_send_request_schedule(sendreq); + if (OPAL_UNLIKELY(NULL == frag->local_handle)) { + mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE); + + return OMPI_ERR_OUT_OF_RESOURCE; } - return OMPI_ERR_OUT_OF_RESOURCE; + local_handle = frag->local_handle; } - } else { - /* already have a source descriptor */ - des = sendreq->src_des; - sendreq->src_des = NULL; } - des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs; - des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt; - des->des_cbfunc = mca_pml_ob1_put_completion; - des->des_cbdata = frag; - PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE, &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND ); - rc = mca_bml_base_put(bml_btl, des); + rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle, + (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length, + 0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - mca_bml_base_free(bml_btl, des); - frag->rdma_length = save_size; - if(OMPI_ERR_OUT_OF_RESOURCE == rc) { - OPAL_THREAD_LOCK(&mca_pml_ob1.lock); - opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); - return OMPI_ERR_OUT_OF_RESOURCE; - } else { - /* TSW - FIX */ - OMPI_ERROR_LOG(rc); - ompi_rte_abort(-1, NULL); - } + mca_pml_ob1_send_request_put_frag_failed (frag, rc); + return rc; } return OMPI_SUCCESS; @@ -1240,12 +1189,11 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag ) */ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq, - mca_btl_base_module_t* btl, + mca_btl_base_module_t* btl, mca_pml_ob1_rdma_hdr_t* hdr ) { mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint; mca_pml_ob1_rdma_frag_t* frag; - size_t i, size = 0; if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) { OPAL_THREAD_ADD32(&sendreq->req_state, -1); @@ -1253,61 +1201,36 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq, sendreq->req_recv.pval = hdr->hdr_recv_req.pval; - MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); + if (NULL == sendreq->rdma_frag) { + MCA_PML_OB1_RDMA_FRAG_ALLOC(frag); - if( OPAL_UNLIKELY(NULL == frag) ) { - /* TSW - FIX */ - OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE); - ompi_rte_abort(-1, NULL); - } - - assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs)); - - /* setup fragment */ - memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt); - - for( i = 0; i < hdr->hdr_seg_cnt; i++ ) { - mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size); - -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) != - (ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - size += opal_swap_bytes4(seg->seg_len); - } else -#endif - { - size += seg->seg_len; + if( OPAL_UNLIKELY(NULL == frag) ) { + /* TSW - FIX */ + OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE); + ompi_rte_abort(-1, NULL); } + } else { + /* rget fallback on put */ + frag = sendreq->rdma_frag; + sendreq->rdma_frag = NULL; + sendreq->req_state = 0; } + /* copy registration data */ + memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size); + frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); 
frag->rdma_hdr.hdr_rdma = *hdr; frag->rdma_req = sendreq; - frag->rdma_ep = bml_endpoint; - frag->rdma_length = size; + frag->rdma_length = hdr->hdr_dst_size; frag->rdma_state = MCA_PML_OB1_RDMA_PUT; - frag->reg = NULL; + frag->remote_address = hdr->hdr_dst_ptr; frag->retries = 0; - if (OPAL_UNLIKELY(NULL != sendreq->src_des)) { - /* get fallback path */ - sendreq->req_state = 0; - } - - /* lookup the corresponding registration */ - for(i=0; i<sendreq->req_rdma_cnt; i++) { - if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) { - frag->reg = sendreq->req_rdma[i].btl_reg; - break; - } - } - - /* RDMA writes may proceed in parallel to send and to each other, so - * create clone of the convertor for each RDMA fragment - */ - size = hdr->hdr_rdma_offset; - opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor, - &frag->convertor, 0, &size); + /* Get the address of the current offset. Note: at this time ob1 CAN NOT handle + * non-contiguous RDMA. If that changes this code will be wrong. */ + opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor, + hdr->hdr_rdma_offset, &frag->local_address); mca_pml_ob1_send_request_put_frag(frag); } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index ff9a85381f..d78845538f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t { mca_pml_ob1_send_pending_t req_pending; opal_mutex_t req_send_range_lock; opal_list_t req_send_ranges; - mca_btl_base_descriptor_t *src_des; + mca_pml_ob1_rdma_frag_t *rdma_frag; mca_pml_ob1_com_btl_t req_rdma[1]; }; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; @@ -124,10 +124,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type) ompi_free_list_item_t* item; \ \ if( OPAL_LIKELY(NULL != proc) ) { \ - OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \ + OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \ sendreq = (mca_pml_ob1_send_request_t*)item; \ sendreq->req_send.req_base.req_proc = proc; \ - sendreq->src_des = NULL; \ } \ } @@ -163,15 +162,18 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type) assert( 0 == _position ); \ } -static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq) +static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq) { size_t r; /* return mpool resources */ for(r = 0; r < sendreq->req_rdma_cnt; r++) { - mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; - if( NULL != reg && reg->mpool != NULL ) { - reg->mpool->mpool_deregister(reg->mpool, reg); + struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg; + mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl; + + if (NULL != handle) { + mca_bml_base_deregister_mem (bml_btl, handle); + sendreq->req_rdma[r].btl_reg = NULL; } } sendreq->req_rdma_cnt = 0; @@ -218,10 +220,14 @@ do { #define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \ do { \ - /* Let the base handle the reference counts */ \ - MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ - OMPI_FREE_LIST_RETURN_MT( 
&mca_pml_base_send_requests, \ - (ompi_free_list_item_t*)sendreq); \ + /* Let the base handle the reference counts */ \ + MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ + if (sendreq->rdma_frag) { \ + MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag); \ + sendreq->rdma_frag = NULL; \ + } \ + OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \ + (ompi_free_list_item_t*)sendreq); \ } while(0)