From f274f524abaf85da9dc649368576ccfcfee3003e Mon Sep 17 00:00:00 2001 From: Tim Woodall Date: Wed, 17 Aug 2005 18:23:38 +0000 Subject: [PATCH] - added get based protocol (if supported by btl) for pre-registered memory - removed 8 bytes from the majority of the pml headers This commit was SVN r6916. --- ompi/mca/bml/bml.h | 9 + ompi/mca/bml/r2/bml_r2.c | 2 +- ompi/mca/btl/btl.h | 3 +- ompi/mca/btl/gm/btl_gm_component.c | 2 +- ompi/mca/btl/mvapi/btl_mvapi.c | 41 ++- ompi/mca/btl/mvapi/btl_mvapi.h | 18 ++ ompi/mca/btl/mvapi/btl_mvapi_component.c | 4 +- ompi/mca/btl/mx/btl_mx_component.c | 2 +- ompi/mca/btl/openib/btl_openib_component.c | 2 +- ompi/mca/btl/portals/btl_portals_component.c | 2 +- ompi/mca/btl/self/btl_self_component.c | 2 +- ompi/mca/btl/tcp/btl_tcp_component.c | 2 +- .../mca/btl/template/btl_template_component.c | 2 +- ompi/mca/mpool/mvapi/mpool_mvapi_module.c | 2 +- ompi/mca/pml/ob1/pml_ob1_hdr.h | 74 ++--- ompi/mca/pml/ob1/pml_ob1_rdmafrag.h | 3 +- ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 41 +-- ompi/mca/pml/ob1/pml_ob1_recvfrag.h | 4 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 259 +++++++++++++--- ompi/mca/pml/ob1/pml_ob1_recvreq.h | 29 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 291 ++++++++++-------- ompi/mca/pml/ob1/pml_ob1_sendreq.h | 61 +--- 22 files changed, 520 insertions(+), 335 deletions(-) diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index c0b935df9c..5a24cc1c05 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -247,6 +247,15 @@ static inline int mca_bml_base_put(mca_bml_base_btl_t* bml_btl, mca_btl_base_des des); } +static inline int mca_bml_base_get(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) { + des->des_context = (void*) bml_btl; + return bml_btl->btl_get( + bml_btl->btl, + bml_btl->btl_endpoint, + des); +} + + static inline void mca_bml_base_prepare_src(mca_bml_base_btl_t* bml_btl, mca_mpool_base_registration_t* reg, struct ompi_convertor_t* conv, diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index 617127f325..766dfed3b3 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -353,7 +353,7 @@ int mca_bml_r2_add_procs( } /* check flags - is rdma prefered */ - if(btl->btl_flags & MCA_BTL_FLAGS_RDMA && + if(btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET) && proc->proc_arch == ompi_proc_local_proc->proc_arch) { mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma); *bml_btl_rdma = *bml_btl; diff --git a/ompi/mca/btl/btl.h b/ompi/mca/btl/btl.h index 7bdfae407a..a0e4440a04 100644 --- a/ompi/mca/btl/btl.h +++ b/ompi/mca/btl/btl.h @@ -134,7 +134,8 @@ typedef uint8_t mca_btl_base_tag_t; /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 0x1 -#define MCA_BTL_FLAGS_RDMA 0x2 +#define MCA_BTL_FLAGS_PUT 0x2 +#define MCA_BTL_FLAGS_GET 0x4 /* btl can send directly from user buffer w/out registration */ #define MCA_BTL_FLAGS_SEND_INPLACE 0x10000000 diff --git a/ompi/mca/btl/gm/btl_gm_component.c b/ompi/mca/btl/gm/btl_gm_component.c index 15892d70be..ce17b47d7b 100644 --- a/ompi/mca/btl/gm/btl_gm_component.c +++ b/ompi/mca/btl/gm/btl_gm_component.c @@ -151,7 +151,7 @@ int mca_btl_gm_component_open(void) mca_btl_gm_param_register_int("max_rdma_size", 128*1024); #if OMPI_MCA_BTL_GM_SUPPORT_REGISTERING && OMPI_MCA_BTL_GM_HAVE_RDMA_PUT mca_btl_gm_module.super.btl_flags = - mca_btl_gm_param_register_int("flags", MCA_BTL_FLAGS_RDMA); + mca_btl_gm_param_register_int("flags", MCA_BTL_FLAGS_PUT); #else mca_btl_gm_module.super.btl_flags = MCA_BTL_FLAGS_SEND; #endif diff --git 
a/ompi/mca/btl/mvapi/btl_mvapi.c b/ompi/mca/btl/mvapi/btl_mvapi.c index 555b2defe7..14eb106f74 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.c +++ b/ompi/mca/btl/mvapi/btl_mvapi.c @@ -55,7 +55,7 @@ mca_btl_mvapi_module_t mca_btl_mvapi_module = { mca_btl_mvapi_prepare_dst, mca_btl_mvapi_send, mca_btl_mvapi_put, - NULL /* get */ + mca_btl_mvapi_get } }; @@ -770,8 +770,8 @@ int mca_btl_mvapi_send( */ int mca_btl_mvapi_put( mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* endpoint, - mca_btl_base_descriptor_t* descriptor) + mca_btl_base_endpoint_t* endpoint, + mca_btl_base_descriptor_t* descriptor) { mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*) btl; mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*) descriptor; @@ -801,7 +801,42 @@ int mca_btl_mvapi_put( mca_btl_base_module_t* btl, } +/* + * RDMA read remote buffer to local buffer address. + */ +int mca_btl_mvapi_get( mca_btl_base_module_t* btl, + mca_btl_base_endpoint_t* endpoint, + mca_btl_base_descriptor_t* descriptor) +{ + mca_btl_mvapi_module_t* mvapi_btl = (mca_btl_mvapi_module_t*) btl; + mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*) descriptor; + frag->endpoint = endpoint; + frag->sr_desc.opcode = VAPI_RDMA_READ; + + frag->sr_desc.remote_qp = endpoint->rem_qp_num_low; + frag->sr_desc.remote_addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_src->seg_addr.pval; + frag->sr_desc.r_key = frag->base.des_src->seg_key.key32[0]; + frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->base.des_dst->seg_addr.pval; + frag->sg_entry.len = frag->base.des_dst->seg_len; + + frag->ret = VAPI_post_sr(mvapi_btl->nic, + endpoint->lcl_qp_hndl_low, + &frag->sr_desc); + if(VAPI_OK != frag->ret){ + return OMPI_ERROR; + } + if(mca_btl_mvapi_component.use_srq) { + MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 1); + MCA_BTL_MVAPI_POST_SRR_LOW(mvapi_btl, 1); + } else { + MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 1); + MCA_BTL_MVAPI_ENDPOINT_POST_RR_LOW(endpoint, 1); + } + return OMPI_SUCCESS; + +} + /* * Asynchronous event handler to detect unforseen diff --git a/ompi/mca/btl/mvapi/btl_mvapi.h b/ompi/mca/btl/mvapi/btl_mvapi.h index 7290237f7b..6ff15efee8 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi.h +++ b/ompi/mca/btl/mvapi/btl_mvapi.h @@ -395,6 +395,24 @@ extern int mca_btl_mvapi_put( struct mca_btl_base_descriptor_t* decriptor ); + +/** + * PML->BTL Initiate a get of the specified size. + * + * @param btl (IN) BTL instance + * @param btl_base_peer (IN) BTL peer addressing + * @param send_request (IN/OUT) Send request (allocated by PML via mca_btl_base_request_alloc_fn_t) + * @param size (IN) Number of bytes PML is requesting BTL to deliver + * @param flags (IN) Flags that should be passed to the peer via the message header. + * @param request (OUT) OMPI_SUCCESS if the BTL was able to queue one or more fragments + */ +extern int mca_btl_mvapi_get( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* btl_peer, + struct mca_btl_base_descriptor_t* decriptor +); + + /** * Allocate a descriptor. 
* diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index 87575293a5..1a5a7f22c6 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -201,7 +201,7 @@ int mca_btl_mvapi_component_open(void) 1024*1024); mca_btl_mvapi_module.super.btl_flags = mca_btl_mvapi_param_register_int("flags", - MCA_BTL_FLAGS_RDMA); + MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET); param = mca_base_param_find("mpi", NULL, "leave_pinned"); @@ -574,6 +574,7 @@ int mca_btl_mvapi_component_progress() BTL_ERROR(("Got an RDMA with Immediate data!, not supported!")); return OMPI_ERROR; + case VAPI_CQE_SQ_RDMA_READ: case VAPI_CQE_SQ_RDMA_WRITE: case VAPI_CQE_SQ_SEND_DATA : @@ -632,6 +633,7 @@ int mca_btl_mvapi_component_progress() BTL_ERROR(("Got an RDMA with Immediate data!, not supported!")); return OMPI_ERROR; + case VAPI_CQE_SQ_RDMA_READ: case VAPI_CQE_SQ_RDMA_WRITE: case VAPI_CQE_SQ_SEND_DATA : diff --git a/ompi/mca/btl/mx/btl_mx_component.c b/ompi/mca/btl/mx/btl_mx_component.c index ce4c35447f..571cf009b7 100644 --- a/ompi/mca/btl/mx/btl_mx_component.c +++ b/ompi/mca/btl/mx/btl_mx_component.c @@ -126,7 +126,7 @@ int mca_btl_mx_component_open(void) mca_btl_mx_module.super.btl_max_rdma_size = mca_btl_mx_param_register_int("max_rdma_size", 1024*1024); mca_btl_mx_module.super.btl_flags = - mca_btl_mx_param_register_int("flags", MCA_BTL_FLAGS_RDMA); + mca_btl_mx_param_register_int("flags", MCA_BTL_FLAGS_PUT); return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index c3bf60c8a8..5b43112646 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -204,7 +204,7 @@ int mca_btl_openib_component_open(void) 1024*1024); mca_btl_openib_module.super.btl_flags = mca_btl_openib_param_register_int("flags", - MCA_BTL_FLAGS_RDMA); + MCA_BTL_FLAGS_PUT); param = mca_base_param_find("mpi", NULL, "leave_pinned"); diff --git a/ompi/mca/btl/portals/btl_portals_component.c b/ompi/mca/btl/portals/btl_portals_component.c index eaea09e187..37e962592e 100644 --- a/ompi/mca/btl/portals/btl_portals_component.c +++ b/ompi/mca/btl/portals/btl_portals_component.c @@ -202,7 +202,7 @@ mca_btl_portals_component_open(void) &dummy); mca_btl_portals_module.super.btl_bandwidth = dummy; - mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; + mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE; mca_btl_portals_module.portals_num_procs = 0; bzero(&(mca_btl_portals_module.portals_reg), diff --git a/ompi/mca/btl/self/btl_self_component.c b/ompi/mca/btl/self/btl_self_component.c index 07df87d78d..cc8c850dd0 100644 --- a/ompi/mca/btl/self/btl_self_component.c +++ b/ompi/mca/btl/self/btl_self_component.c @@ -119,7 +119,7 @@ int mca_btl_self_component_open(void) mca_btl_self.btl_exclusivity = mca_btl_self_param_register_int("exclusivity", 64*1024); mca_btl_self.btl_flags = - mca_btl_self_param_register_int("flags", MCA_BTL_FLAGS_RDMA); + mca_btl_self_param_register_int("flags", MCA_BTL_FLAGS_PUT); /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_self_component.self_lock, opal_mutex_t); diff --git a/ompi/mca/btl/tcp/btl_tcp_component.c b/ompi/mca/btl/tcp/btl_tcp_component.c index 6853ae102a..e2595c174a 100644 --- a/ompi/mca/btl/tcp/btl_tcp_component.c +++ b/ompi/mca/btl/tcp/btl_tcp_component.c @@ -218,7 +218,7 @@ int mca_btl_tcp_component_open(void) mca_btl_tcp_module.super.btl_max_rdma_size = 
mca_btl_tcp_param_register_int("max_rdma_size", INT_MAX); mca_btl_tcp_module.super.btl_flags = - mca_btl_tcp_param_register_int("flags", MCA_BTL_FLAGS_RDMA|MCA_BTL_FLAGS_SEND_INPLACE); + mca_btl_tcp_param_register_int("flags", MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_SEND_INPLACE); return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/template/btl_template_component.c b/ompi/mca/btl/template/btl_template_component.c index fa82d639c7..1aa3039299 100644 --- a/ompi/mca/btl/template/btl_template_component.c +++ b/ompi/mca/btl/template/btl_template_component.c @@ -128,7 +128,7 @@ int mca_btl_template_component_open(void) mca_btl_template_module.super.btl_max_rdma_size = mca_btl_template_param_register_int("max_rdma_size", 1024*1024); mca_btl_template_module.super.btl_flags = - mca_btl_template_param_register_int("flags", MCA_BTL_FLAGS_RDMA); + mca_btl_template_param_register_int("flags", MCA_BTL_FLAGS_PUT); return OMPI_SUCCESS; } diff --git a/ompi/mca/mpool/mvapi/mpool_mvapi_module.c b/ompi/mca/mpool/mvapi/mpool_mvapi_module.c index b8edd4b1e0..2dc69c8c9d 100644 --- a/ompi/mca/mpool/mvapi/mpool_mvapi_module.c +++ b/ompi/mca/mpool/mvapi/mpool_mvapi_module.c @@ -82,7 +82,7 @@ int mca_mpool_mvapi_register(mca_mpool_base_module_t* mpool, vapi_reg->hndl = VAPI_INVAL_HNDL; - mr_in.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE; + mr_in.acl = VAPI_EN_LOCAL_WRITE | VAPI_EN_REMOTE_WRITE | VAPI_EN_REMOTE_READ; mr_in.l_key = 0; mr_in.r_key = 0; mr_in.pd_hndl = mpool_module->hca_pd.pd_tag; diff --git a/ompi/mca/pml/ob1/pml_ob1_hdr.h b/ompi/mca/pml/ob1/pml_ob1_hdr.h index b56e13837d..cb562bf539 100644 --- a/ompi/mca/pml/ob1/pml_ob1_hdr.h +++ b/ompi/mca/pml/ob1/pml_ob1_hdr.h @@ -20,7 +20,6 @@ #define MCA_PML_OB1_HEADER_H #include "ompi_config.h" -#include "mca/ptl/ptl.h" #ifdef HAVE_SYS_TYPES_H #include #endif @@ -28,15 +27,16 @@ #include #endif -#define MCA_PML_OB1_HDR_TYPE_MATCH 1 -#define MCA_PML_OB1_HDR_TYPE_RNDV 2 -#define MCA_PML_OB1_HDR_TYPE_ACK 3 -#define MCA_PML_OB1_HDR_TYPE_NACK 4 -#define MCA_PML_OB1_HDR_TYPE_FRAG 5 -#define MCA_PML_OB1_HDR_TYPE_GET 6 -#define MCA_PML_OB1_HDR_TYPE_PUT 7 -#define MCA_PML_OB1_HDR_TYPE_FIN 8 -#define MCA_PML_OB1_HDR_TYPE_MAX 9 +#define MCA_PML_OB1_HDR_TYPE_MATCH 1 +#define MCA_PML_OB1_HDR_TYPE_RNDV 2 +#define MCA_PML_OB1_HDR_TYPE_RGET 3 +#define MCA_PML_OB1_HDR_TYPE_ACK 4 +#define MCA_PML_OB1_HDR_TYPE_NACK 5 +#define MCA_PML_OB1_HDR_TYPE_FRAG 6 +#define MCA_PML_OB1_HDR_TYPE_GET 7 +#define MCA_PML_OB1_HDR_TYPE_PUT 8 +#define MCA_PML_OB1_HDR_TYPE_FIN 9 +#define MCA_PML_OB1_HDR_TYPE_MAX 10 #define MCA_PML_OB1_HDR_FLAGS_ACK 1 /* is an ack required */ #define MCA_PML_OB1_HDR_FLAGS_NBO 2 /* is the hdr in network byte order */ @@ -93,7 +93,6 @@ struct mca_pml_ob1_common_hdr_t { typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t; #define MCA_PML_OB1_COMMON_HDR_NTOH(h) - #define MCA_PML_OB1_COMMON_HDR_HTON(h) /** @@ -102,35 +101,32 @@ typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t; */ struct mca_pml_ob1_match_hdr_t { mca_pml_ob1_common_hdr_t hdr_common; /**< common attributes */ - uint16_t hdr_contextid; /**< communicator index */ - int32_t hdr_src; /**< source rank */ - int32_t hdr_dst; /**< destination rank */ - int32_t hdr_tag; /**< user tag */ - uint64_t hdr_msg_length; /**< message length */ - uint16_t hdr_msg_seq; /**< message sequence number */ + uint16_t hdr_ctx; /**< communicator index */ + int32_t hdr_src; /**< source rank */ + int32_t hdr_dst; /**< destination rank */ + int32_t hdr_tag; /**< user tag */ + uint16_t hdr_seq; /**< message sequence 
number */ }; typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t; #define MCA_PML_OB1_MATCH_HDR_NTOH(h) \ do { \ MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ - (h).hdr_contextid = ntohs((h).hdr_contextid); \ + (h).hdr_ctx = ntohs((h).hdr_ctx); \ (h).hdr_src = ntohl((h).hdr_src); \ (h).hdr_dst = ntohl((h).hdr_dst); \ (h).hdr_tag = ntohl((h).hdr_tag); \ - (h).hdr_msg_length = ntoh64((h).hdr_msg_length); \ - (h).hdr_msg_seq = ntohs((h).hdr_msg_seq); \ + (h).hdr_seq = ntohs((h).hdr_seq); \ } while (0) #define MCA_PML_OB1_MATCH_HDR_HTON(h) \ do { \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ - (h).hdr_contextid = htons((h).hdr_contextid); \ + (h).hdr_ctx = htons((h).hdr_ctx); \ (h).hdr_src = htonl((h).hdr_src); \ (h).hdr_dst = htonl((h).hdr_dst); \ (h).hdr_tag = htonl((h).hdr_tag); \ - (h).hdr_msg_length = hton64((h).hdr_msg_length); \ - (h).hdr_msg_seq = htons((h).hdr_msg_seq); \ + (h).hdr_seq = htons((h).hdr_seq); \ } while (0) /** @@ -140,7 +136,7 @@ typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t; */ struct mca_pml_ob1_rendezvous_hdr_t { mca_pml_ob1_match_hdr_t hdr_match; - uint64_t hdr_frag_length; /**< fragment length */ + uint64_t hdr_msg_length; /**< message length */ ompi_ptr_t hdr_src_req; /**< pointer to source request - returned in ack */ }; typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t; @@ -148,21 +144,31 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t; #define MCA_PML_OB1_RNDV_HDR_NTOH(h) \ do { \ MCA_PML_OB1_MATCH_HDR_NTOH((h).hdr_match); \ - (h).hdr_frag_length = ntoh64((h).hdr_frag_length); \ + (h).hdr_msg_length = ntoh64((h).hdr_msg_length); \ } while (0) #define MCA_PML_OB1_RNDV_HDR_HTON(h) \ do { \ MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \ - (h).hdr_frag_length = hton64((h).hdr_frag_length); \ + (h).hdr_msg_length = hton64((h).hdr_msg_length); \ } while (0) +/** + * Header definition for a combined rdma rendezvous/get + */ +struct mca_pml_ob1_rget_hdr_t { + mca_pml_ob1_rendezvous_hdr_t hdr_rndv; + ompi_ptr_t hdr_des; /**< source descriptor */ + uint32_t hdr_seg_cnt; /**< number of segments for rdma */ + mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */ +}; +typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t; + /** * Header for subsequent fragments. 
*/ struct mca_pml_ob1_frag_hdr_t { - mca_pml_ob1_common_hdr_t hdr_common; /**< common attributes */ - uint64_t hdr_frag_length; /**< fragment length */ + mca_pml_ob1_common_hdr_t hdr_common; /**< common attributes */ uint64_t hdr_frag_offset; /**< offset into message */ ompi_ptr_t hdr_src_req; /**< pointer to source request */ ompi_ptr_t hdr_dst_req; /**< pointer to matched receive */ @@ -172,14 +178,12 @@ typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t; #define MCA_PML_OB1_FRAG_HDR_NTOH(h) \ do { \ MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \ - (h).hdr_frag_length = ntoh64((h).hdr_frag_length); \ (h).hdr_frag_offset = ntoh64((h).hdr_frag_offset); \ } while (0) #define MCA_PML_OB1_FRAG_HDR_HTON(h) \ do { \ MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \ - (h).hdr_frag_length = hton64((h).hdr_frag_length); \ (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \ } while (0) @@ -214,8 +218,8 @@ typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t; struct mca_pml_ob1_rdma_hdr_t { mca_pml_ob1_common_hdr_t hdr_common; /**< common attributes */ - ompi_ptr_t hdr_src; /**< source request/descriptor */ - ompi_ptr_t hdr_dst; /**< receive request/descriptor */ + ompi_ptr_t hdr_req; /**< destination request */ + ompi_ptr_t hdr_des; /**< source descriptor */ uint64_t hdr_rdma_offset; /**< current offset into user buffer */ uint32_t hdr_seg_cnt; /**< number of segments for rdma */ mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */ @@ -228,10 +232,7 @@ typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t; struct mca_pml_ob1_fin_hdr_t { mca_pml_ob1_common_hdr_t hdr_common; /**< common attributes */ - ompi_ptr_t hdr_src; /**< source request/descriptor */ - ompi_ptr_t hdr_dst; /**< receive request/descriptor */ - uint64_t hdr_rdma_offset; /**< data offset */ - uint64_t hdr_rdma_length; /**< number of segments for rdma */ + ompi_ptr_t hdr_des; /**< completed descriptor */ }; typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t; @@ -242,6 +243,7 @@ union mca_pml_ob1_hdr_t { mca_pml_ob1_common_hdr_t hdr_common; mca_pml_ob1_match_hdr_t hdr_match; mca_pml_ob1_rendezvous_hdr_t hdr_rndv; + mca_pml_ob1_rget_hdr_t hdr_rget; mca_pml_ob1_frag_hdr_t hdr_frag; mca_pml_ob1_ack_hdr_t hdr_ack; mca_pml_ob1_rdma_hdr_t hdr_rdma; diff --git a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h index 005e7ee469..6a7dc57feb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_rdmafrag.h @@ -27,6 +27,7 @@ typedef enum { MCA_PML_OB1_RMDA_INIT, MCA_PML_OB1_RDMA_PREPARE, MCA_PML_OB1_RDMA_PUT, + MCA_PML_OB1_RDMA_GET, MCA_PML_OB1_RDMA_FIN } mca_pml_ob1_rdma_state_t; @@ -37,7 +38,7 @@ struct mca_pml_ob1_rdma_frag_t { mca_pml_ob1_rdma_state_t rdma_state; size_t rdma_length; mca_btl_base_segment_t rdma_segs[MCA_BTL_DES_MAX_SEGMENTS]; - struct mca_pml_ob1_send_request_t* rdma_req; + void *rdma_req; struct mca_bml_base_endpoint_t* rdma_ep; }; typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index ea3c7871cb..3d079737b6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -69,6 +69,7 @@ void mca_pml_ob1_recv_frag_callback( switch(hdr->hdr_common.hdr_type) { case MCA_PML_OB1_HDR_TYPE_MATCH: case MCA_PML_OB1_HDR_TYPE_RNDV: + case MCA_PML_OB1_HDR_TYPE_RGET: { mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,des->des_dst_cnt); break; @@ -79,9 +80,6 @@ void mca_pml_ob1_recv_frag_callback( 
hdr->hdr_ack.hdr_src_req.pval; sendreq->req_recv = hdr->hdr_ack.hdr_dst_req; sendreq->req_rdma_offset = hdr->hdr_ack.hdr_rdma_offset; -#if MCA_PML_OB1_TIMESTAMPS - sendreq->t_send1 = get_profiler_timestamp(); -#endif MCA_PML_OB1_SEND_REQUEST_ADVANCE(sendreq); break; } @@ -89,32 +87,21 @@ void mca_pml_ob1_recv_frag_callback( { mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*) hdr->hdr_frag.hdr_dst_req.pval; - mca_pml_ob1_recv_request_progress(recvreq,segments,des->des_dst_cnt); + mca_pml_ob1_recv_request_progress(recvreq,btl,segments,des->des_dst_cnt); break; } case MCA_PML_OB1_HDR_TYPE_PUT: { mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*) - hdr->hdr_rdma.hdr_src.pval; + hdr->hdr_rdma.hdr_req.pval; mca_pml_ob1_send_request_put(sendreq,btl,&hdr->hdr_rdma); break; } case MCA_PML_OB1_HDR_TYPE_FIN: { - mca_btl_base_descriptor_t* dst = (mca_btl_base_descriptor_t*) - hdr->hdr_fin.hdr_dst.pval; - mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)dst->des_cbdata; -#if MCA_PML_OB1_TIMESTAMPS - recvreq->fin1[recvreq->fin_index] = get_profiler_timestamp(); - btl->btl_free(btl,dst); - recvreq->fin2[recvreq->fin_index] = get_profiler_timestamp(); - recvreq->fin_index++; - - mca_pml_ob1_recv_request_progress(recvreq,segments,des->des_dst_cnt); -#else - mca_pml_ob1_recv_request_progress(recvreq,segments,des->des_dst_cnt); - btl->btl_free(btl,dst); -#endif + mca_btl_base_descriptor_t* rdma = (mca_btl_base_descriptor_t*) + hdr->hdr_fin.hdr_des.pval; + rdma->des_cbfunc(btl, NULL, rdma, OMPI_SUCCESS); break; } default: @@ -426,11 +413,11 @@ int mca_pml_ob1_recv_frag_match( int rc; /* communicator pointer */ - comm_ptr=ompi_comm_lookup(hdr->hdr_contextid); + comm_ptr=ompi_comm_lookup(hdr->hdr_ctx); comm=(mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm; /* source sequence number */ - frag_msg_seq = hdr->hdr_msg_seq; + frag_msg_seq = hdr->hdr_seq; proc = comm->procs + hdr->hdr_src; /* get next expected message sequence number - if threaded @@ -497,7 +484,7 @@ int mca_pml_ob1_recv_frag_match( OPAL_THREAD_UNLOCK(&comm->matching_lock); return rc; } - MCA_PML_OB1_RECV_FRAG_INIT(frag,hdr,segments,num_segments); + MCA_PML_OB1_RECV_FRAG_INIT(frag,hdr,segments,num_segments,btl); opal_list_append( &proc->unexpected_frags, (opal_list_item_t *)frag ); } @@ -522,7 +509,7 @@ int mca_pml_ob1_recv_frag_match( OPAL_THREAD_UNLOCK(&comm->matching_lock); return rc; } - MCA_PML_OB1_RECV_FRAG_INIT(frag,hdr,segments,num_segments); + MCA_PML_OB1_RECV_FRAG_INIT(frag,hdr,segments,num_segments,btl); opal_list_append(&proc->frags_cant_match, (opal_list_item_t *)frag); } @@ -531,15 +518,13 @@ int mca_pml_ob1_recv_frag_match( /* release matching lock before processing fragment */ if(match != NULL) { - MCA_PML_OB1_RECV_REQUEST_MATCHED(match, hdr); - mca_pml_ob1_recv_request_progress(match,segments,num_segments); + mca_pml_ob1_recv_request_progress(match,btl,segments,num_segments); } if(additional_match) { opal_list_item_t* item; while(NULL != (item = opal_list_remove_first(&additional_matches))) { mca_pml_ob1_recv_frag_t* frag = (mca_pml_ob1_recv_frag_t*)item; - MCA_PML_OB1_RECV_REQUEST_MATCHED(frag->request, hdr); - mca_pml_ob1_recv_request_progress(frag->request,frag->segments,frag->num_segments); + mca_pml_ob1_recv_request_progress(frag->request,frag->btl,frag->segments,frag->num_segments); MCA_PML_OB1_RECV_FRAG_RETURN(frag); } } @@ -601,7 +586,7 @@ static bool mca_pml_ob1_check_cantmatch_for_match( /* * If the message has the next expected seq from that proc... 
*/ - frag_seq=frag->hdr.hdr_match.hdr_msg_seq; + frag_seq=frag->hdr.hdr_match.hdr_seq; if (frag_seq == next_msg_seq_expected) { mca_pml_ob1_match_hdr_t* hdr = &frag->hdr.hdr_match; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h index 43cc7a98de..f04fd20ed4 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h @@ -38,6 +38,7 @@ struct mca_pml_ob1_recv_frag_t { mca_pml_ob1_hdr_t hdr; struct mca_pml_ob1_recv_request_t* request; size_t num_segments; + mca_btl_base_module_t* btl; mca_btl_base_segment_t segments[MCA_BTL_DES_MAX_SEGMENTS]; mca_pml_ob1_buffer_t* buffers[MCA_BTL_DES_MAX_SEGMENTS]; }; @@ -54,13 +55,14 @@ do { \ } while(0) -#define MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr,segs,cnt) \ +#define MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr,segs,cnt,btl) \ do { \ size_t i; \ mca_btl_base_segment_t* macro_segments = frag->segments; \ mca_pml_ob1_buffer_t** buffers = frag->buffers; \ \ /* init recv_frag */ \ + frag->btl = btl; \ frag->hdr = *(mca_pml_ob1_hdr_t*)hdr; \ frag->num_segments = cnt; \ /* copy over data */ \ diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 9d0a0a3e39..7e89d39911 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -24,7 +24,9 @@ #include "pml_ob1_recvreq.h" #include "pml_ob1_recvfrag.h" #include "pml_ob1_sendreq.h" +#include "pml_ob1_rdmafrag.h" #include "mca/bml/base/base.h" +#include "mca/errmgr/errmgr.h" static mca_pml_ob1_recv_frag_t* mca_pml_ob1_recv_request_match_specific_proc( mca_pml_ob1_recv_request_t* request, mca_pml_ob1_comm_proc_t* proc); @@ -102,16 +104,31 @@ OBJ_CLASS_INSTANCE( * Release resources. */ -static void mca_pml_ob1_send_ctl_complete( - mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* des, - int status) +static void mca_pml_ob1_ctl_completion( + mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status) { mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; MCA_BML_BASE_BTL_DES_RETURN(bml_btl, des); } +/* + * Put operation has completed remotely - update request status + */ + +static void mca_pml_ob1_put_completion( + mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status) +{ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata; + mca_pml_ob1_recv_request_progress(recvreq,btl,des->des_dst,des->des_dst_cnt); + MCA_BML_BASE_BTL_DES_RETURN(bml_btl, des); +} /* * @@ -143,7 +160,7 @@ static void mca_pml_ob1_recv_request_ack( * registered. if registered on both sides - do one rdma for * the entire message. 
*/ - if(hdr->hdr_match.hdr_msg_length > 0) { + if(hdr->hdr_msg_length > 0) { recvreq->req_chunk = mca_mpool_base_find(recvreq->req_recv.req_base.req_addr); if( NULL != recvreq->req_chunk && @@ -170,8 +187,8 @@ static void mca_pml_ob1_recv_request_ack( /* use convertor to figure out the rdma offset for this request */ recvreq->req_rdma_offset = bml_endpoint->btl_rdma_offset; - if(recvreq->req_rdma_offset < hdr->hdr_frag_length) { - recvreq->req_rdma_offset = hdr->hdr_frag_length; + if(recvreq->req_rdma_offset < recvreq->req_bytes_received) { + recvreq->req_rdma_offset = recvreq->req_bytes_received; } ompi_convertor_set_position( &recvreq->req_recv.req_convertor, @@ -182,7 +199,7 @@ static void mca_pml_ob1_recv_request_ack( /* start rdma at the current fragment offset - no need to send an ack in this case */ } else { - recvreq->req_rdma_offset = hdr->hdr_frag_length; + recvreq->req_rdma_offset = recvreq->req_bytes_received; return; } @@ -207,8 +224,7 @@ static void mca_pml_ob1_recv_request_ack( /* initialize descriptor */ des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - des->des_cbfunc = mca_pml_ob1_send_ctl_complete; - des->des_cbdata = bml_btl; + des->des_cbfunc = mca_pml_ob1_ctl_completion; rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML); if(rc != OMPI_SUCCESS) { @@ -226,6 +242,166 @@ retry: opal_list_append(&mca_pml_ob1.acks_pending, (opal_list_item_t*)frag); } + +/** + * Return resources used by the RDMA + */ + +static void mca_pml_ob1_fin_completion( + mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status) +{ + + mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + MCA_PML_OB1_RDMA_FRAG_RETURN(frag); + MCA_BML_BASE_BTL_DES_RETURN(bml_btl, des); +} + + +/* + * + */ + +static void mca_pml_ob1_rget_completion( + mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status) +{ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata; + mca_pml_ob1_recv_request_t* recvreq = frag->rdma_req; + mca_pml_ob1_fin_hdr_t* hdr; + mca_btl_base_descriptor_t *fin; + int rc; + + /* is receive request complete */ + OPAL_THREAD_LOCK(&ompi_request_lock); + recvreq->req_bytes_received += frag->rdma_length; + recvreq->req_bytes_delivered += frag->rdma_length; + if(recvreq->req_bytes_received == recvreq->req_recv.req_bytes_packed) { + recvreq->req_recv.req_base.req_ompi.req_status._count = recvreq->req_bytes_delivered; + recvreq->req_recv.req_base.req_pml_complete = true; + recvreq->req_recv.req_base.req_ompi.req_complete = true; + if(ompi_request_waiting) { + opal_condition_broadcast(&ompi_request_cond); + } + } + OPAL_THREAD_UNLOCK(&ompi_request_lock); + + /* return descriptor */ + MCA_BML_BASE_BTL_DES_RETURN(bml_btl, des); + + /* queue up a fin control message to source */ + MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, fin, sizeof(mca_pml_ob1_fin_hdr_t)); + if(NULL == fin) { + opal_output(0, "[%s:%d] unable to allocate descriptor", __FILE__,__LINE__); + orte_errmgr.abort(); + } + fin->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; + fin->des_cbfunc = mca_pml_ob1_fin_completion; + fin->des_cbdata = frag; + + /* fill in header */ + hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_src->seg_addr.pval; + hdr->hdr_common.hdr_flags = 0; + hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; + hdr->hdr_des = 
frag->rdma_hdr.hdr_rget.hdr_des; + + /* queue request */ + rc = mca_bml_base_send( + bml_btl, + fin, + MCA_BTL_TAG_PML + ); + if(OMPI_SUCCESS != rc) { + opal_output(0, "[%s:%d] unable to queue fin", __FILE__,__LINE__); + orte_errmgr.abort(); + } +} + + +/* + * + */ + +static void mca_pml_ob1_recv_request_rget( + mca_pml_ob1_recv_request_t* recvreq, + mca_btl_base_module_t* btl, + mca_pml_ob1_rget_hdr_t* hdr) +{ + mca_bml_base_endpoint_t* bml_endpoint = NULL; + mca_bml_base_btl_t* bml_btl; + mca_pml_ob1_rdma_frag_t* frag; + mca_btl_base_descriptor_t* descriptor; + mca_mpool_base_registration_t* registration = NULL; + size_t i, size = 0; + int rc; + + /* lookup bml datastructures */ + bml_endpoint = (mca_bml_base_endpoint_t*) recvreq->req_proc->proc_pml; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, btl); + if(NULL == bml_btl) { + opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__); + orte_errmgr.abort(); + } + + /* allocate/initialize a fragment */ + MCA_PML_OB1_RDMA_FRAG_ALLOC(frag,rc); + for(i=0; ihdr_seg_cnt; i++) { + size += hdr->hdr_segs[i].seg_len; + frag->rdma_segs[i] = hdr->hdr_segs[i]; + } + frag->rdma_hdr.hdr_rget = *hdr; + frag->rdma_req = recvreq; + frag->rdma_ep = bml_endpoint; + frag->rdma_state = MCA_PML_OB1_RDMA_PREPARE; + + /* is there an existing registration for this btl */ + recvreq->req_chunk = mca_mpool_base_find(recvreq->req_recv.req_base.req_addr); + if( NULL != recvreq->req_chunk ) { + struct mca_mpool_base_reg_mpool_t *reg = recvreq->req_chunk->mpools; + while(reg->mpool != NULL) { + if(NULL != mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, + (mca_btl_base_module_t*) reg->user_data)) { + recvreq->req_mpool = reg; + registration = reg->mpool_registration; + break; + } + reg++; + } + } + + /* prepare descriptor */ + mca_bml_base_prepare_dst( + bml_btl, + registration, + &recvreq->req_recv.req_convertor, + 0, + &size, + &descriptor); + if(NULL == descriptor) { + opal_output(0, "[%s:%d] unable to allocate descriptor for rdma get", __FILE__, __LINE__); + orte_errmgr.abort(); + } + + frag->rdma_length = size; + descriptor->des_src = frag->rdma_segs; + descriptor->des_src_cnt = hdr->hdr_seg_cnt; + descriptor->des_cbdata = frag; + descriptor->des_cbfunc = mca_pml_ob1_rget_completion; + + /* queue up get request */ + rc = mca_bml_base_get(bml_btl,descriptor); + if(rc != OMPI_SUCCESS) { + opal_output(0, "[%s:%d] rdma get failed with error %d", __FILE__, __LINE__, rc); + orte_errmgr.abort(); + } +} + /* * Update the recv request status to reflect the number of bytes @@ -234,6 +410,7 @@ retry: void mca_pml_ob1_recv_request_progress( mca_pml_ob1_recv_request_t* recvreq, + mca_btl_base_module_t* btl, mca_btl_base_segment_t* segments, size_t num_segments) { @@ -242,11 +419,17 @@ void mca_pml_ob1_recv_request_progress( size_t data_offset = 0; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; bool schedule = false; + size_t i; + + for(i=0; ihdr_common.hdr_type) { case MCA_PML_OB1_HDR_TYPE_MATCH: - bytes_received = hdr->hdr_match.hdr_msg_length; + bytes_received -= sizeof(mca_pml_ob1_match_hdr_t); + recvreq->req_recv.req_bytes_packed = bytes_received; + MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq,&hdr->hdr_match); MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq, segments, @@ -259,12 +442,11 @@ void mca_pml_ob1_recv_request_progress( case MCA_PML_OB1_HDR_TYPE_RNDV: -#if MCA_PML_OB1_TIMESTAMPS - recvreq->ack = get_profiler_timestamp(); -#endif + bytes_received -= sizeof(mca_pml_ob1_rendezvous_hdr_t); + 
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; recvreq->req_send = hdr->hdr_rndv.hdr_src_req; + MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq,&hdr->hdr_match); mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv); - bytes_received = hdr->hdr_rndv.hdr_frag_length; MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq, segments, @@ -275,9 +457,16 @@ void mca_pml_ob1_recv_request_progress( bytes_delivered); break; + case MCA_PML_OB1_HDR_TYPE_RGET: + + recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; + MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq,&hdr->hdr_match); + mca_pml_ob1_recv_request_rget(recvreq, btl, &hdr->hdr_rget); + return; + case MCA_PML_OB1_HDR_TYPE_FRAG: - bytes_received = hdr->hdr_frag.hdr_frag_length; + bytes_received -= sizeof(mca_pml_ob1_frag_hdr_t); data_offset = hdr->hdr_frag.hdr_frag_offset; MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq, @@ -291,7 +480,6 @@ void mca_pml_ob1_recv_request_progress( case MCA_PML_OB1_HDR_TYPE_FIN: - bytes_delivered = bytes_received = hdr->hdr_fin.hdr_rdma_length; OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); break; @@ -310,21 +498,6 @@ void mca_pml_ob1_recv_request_progress( recvreq->req_recv.req_base.req_pml_complete = true; recvreq->req_recv.req_base.req_ompi.req_complete = true; -#if MCA_PML_OB1_TIMESTAMPS - if(recvreq->req_bytes_received > 0) { - int i; - opal_output(0, "[%d,%d,%d] dst ack: %llu", - ORTE_NAME_ARGS(orte_process_info.my_name), recvreq->ack); - for(i=0; ipin_index; i++) { - opal_output(0, "[%d,%d,%d] dst pin, %llu %llu", - ORTE_NAME_ARGS(orte_process_info.my_name), recvreq->pin1[i], recvreq->pin2[i] - recvreq->pin1[i]); - } - for(i=0; ifin_index; i++) { - opal_output(0, "[%d,%d,%d] dst fin: %llu %llu", - ORTE_NAME_ARGS(orte_process_info.my_name), recvreq->fin1[i], recvreq->fin2[i] - recvreq->fin1[i]); - } - } -#endif if(ompi_request_waiting) { opal_condition_broadcast(&ompi_request_cond); } @@ -409,9 +582,6 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) /* prepare a descriptor for RDMA */ ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset); -#if MCA_PML_OB1_TIMESTAMPS - recvreq->pin1[recvreq->pin_index] = get_profiler_timestamp(); -#endif mca_bml_base_prepare_dst( bml_btl, reg, @@ -419,16 +589,13 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) 0, &size, &dst); -#if MCA_PML_OB1_TIMESTAMPS - recvreq->pin2[recvreq->pin_index] = get_profiler_timestamp(); - recvreq->pin_index++; -#endif if(dst == NULL) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); opal_list_append(&mca_pml_ob1.recv_pending, (opal_list_item_t*)recvreq); OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); break; } + dst->des_cbfunc = mca_pml_ob1_put_completion; dst->des_cbdata = recvreq; /* prepare a descriptor for rdma control message */ @@ -446,15 +613,14 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) break; } ctl->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; - ctl->des_cbfunc = mca_pml_ob1_send_ctl_complete; - ctl->des_cbdata = bml_btl; + ctl->des_cbfunc = mca_pml_ob1_ctl_completion; /* fill in rdma header */ hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_src->seg_addr.pval; hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT; hdr->hdr_common.hdr_flags = 0; - hdr->hdr_src = recvreq->req_send; - hdr->hdr_dst.pval = dst; + hdr->hdr_req = recvreq->req_send; + hdr->hdr_des.pval = dst; hdr->hdr_rdma_offset = recvreq->req_rdma_offset; hdr->hdr_seg_cnt = dst->des_dst_cnt; memcpy(hdr->hdr_segs, dst->des_dst, dst->des_dst_cnt * 
sizeof(mca_btl_base_segment_t)); @@ -506,7 +672,7 @@ void mca_pml_ob1_recv_request_match_specific(mca_pml_ob1_recv_request_t* request (frag = mca_pml_ob1_recv_request_match_specific_proc(request, proc)) != NULL) { OPAL_THREAD_UNLOCK(&comm->matching_lock); - mca_pml_ob1_recv_request_progress(request,frag->segments,frag->num_segments); + mca_pml_ob1_recv_request_progress(request,frag->btl,frag->segments,frag->num_segments); if( !((MCA_PML_REQUEST_IPROBE == request->req_recv.req_base.req_type) || (MCA_PML_REQUEST_PROBE == request->req_recv.req_base.req_type)) ) { MCA_PML_OB1_RECV_FRAG_RETURN(frag); @@ -560,7 +726,7 @@ void mca_pml_ob1_recv_request_match_wild(mca_pml_ob1_recv_request_t* request) if ((frag = mca_pml_ob1_recv_request_match_specific_proc(request, proc)) != NULL) { OPAL_THREAD_UNLOCK(&comm->matching_lock); - mca_pml_ob1_recv_request_progress(request,frag->segments,frag->num_segments); + mca_pml_ob1_recv_request_progress(request,frag->btl,frag->segments,frag->num_segments); if( !((MCA_PML_REQUEST_IPROBE == request->req_recv.req_base.req_type) || (MCA_PML_REQUEST_PROBE == request->req_recv.req_base.req_type)) ) { MCA_PML_OB1_RECV_FRAG_RETURN(frag); @@ -620,7 +786,6 @@ static mca_pml_ob1_recv_frag_t* mca_pml_ob1_recv_request_match_specific_proc( } return NULL; find_fragment: - MCA_PML_OB1_RECV_REQUEST_MATCHED(request, hdr); if( !((MCA_PML_REQUEST_IPROBE == request->req_recv.req_base.req_type) || (MCA_PML_REQUEST_PROBE == request->req_recv.req_base.req_type)) ) { opal_list_remove_item(unexpected_frags, (opal_list_item_t*)frag); diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 91208dfd17..c33402d785 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -28,12 +28,6 @@ extern "C" { #endif -struct mca_pml_ob1_registration_t { - struct mca_pml_ob1_endpoint_t* endpoint; - struct mca_mpool_base_registration_t* registration; -}; -typedef struct mca_pml_ob1_registration_t mca_pml_ob1_registration_t; - struct mca_pml_ob1_recv_request_t { mca_pml_base_recv_request_t req_recv; @@ -46,18 +40,6 @@ struct mca_pml_ob1_recv_request_t { size_t req_bytes_received; size_t req_bytes_delivered; size_t req_rdma_offset; - mca_pml_ob1_registration_t req_reg[MCA_MPOOL_BASE_MAX_REG]; - size_t req_num_reg; - -#if MCA_PML_OB1_TIMESTAMPS - unsigned long long ack; - unsigned long long pin1[MCA_PML_OB1_NUM_TSTAMPS]; - unsigned long long pin2[MCA_PML_OB1_NUM_TSTAMPS]; - unsigned long long fin1[MCA_PML_OB1_NUM_TSTAMPS]; - unsigned long long fin2[MCA_PML_OB1_NUM_TSTAMPS]; - int pin_index; - int fin_index; -#endif }; typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t; @@ -154,13 +136,6 @@ void mca_pml_ob1_recv_request_match_specific(mca_pml_ob1_recv_request_t* request /** * Initialize diagnostic code for tracing rdma protocol timing */ -#if MCA_PML_OB1_TIMESTAMPS -#define MCA_PML_OB1_RECV_REQUEST_TSTAMPS_INIT(recvreq) \ - (request)->fin_index = 0; \ - (request)->pin_index = 0; -#else -#define MCA_PML_OB1_RECV_REQUEST_TSTAMPS_INIT(recvreq) -#endif /** * Start an initialized request. @@ -180,7 +155,6 @@ do { (request)->req_recv.req_base.req_pml_complete = false; \ (request)->req_recv.req_base.req_ompi.req_complete = false; \ (request)->req_recv.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE; \ - MCA_PML_OB1_RECV_REQUEST_TSTAMPS_INIT(request); \ \ /* always set the req_status.MPI_TAG to ANY_TAG before starting the \ * request. 
This field is used if cancelled to find out if the request \ @@ -207,10 +181,8 @@ do { request, \ hdr) \ do { \ - (request)->req_recv.req_bytes_packed = (hdr)->hdr_msg_length; \ (request)->req_recv.req_base.req_ompi.req_status.MPI_TAG = (hdr)->hdr_tag; \ (request)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = (hdr)->hdr_src; \ - \ if((request)->req_recv.req_bytes_packed != 0) { \ ompi_proc_t *proc = \ ompi_comm_peer_lookup( \ @@ -280,6 +252,7 @@ do { void mca_pml_ob1_recv_request_progress( mca_pml_ob1_recv_request_t* req, + struct mca_btl_base_module_t* btl, mca_btl_base_segment_t* segments, size_t num_segments); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index ea013514f0..feab5b1184 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -121,13 +121,9 @@ static void mca_pml_ob1_rndv_completion( } /* count bytes of user data actually delivered */ + OPAL_THREAD_LOCK(&ompi_request_lock); MCA_PML_OB1_SEND_REQUEST_SET_BYTES_DELIVERED(sendreq,descriptor,sizeof(mca_pml_ob1_rendezvous_hdr_t)); - -#if MCA_PML_OB1_TIMESTAMPS - if(sendreq->req_pipeline_depth == 1) { - sendreq->t_send2 = get_profiler_timestamp(); - } -#endif + OPAL_THREAD_UNLOCK(&ompi_request_lock); /* return the descriptor */ mca_bml_base_free(bml_btl, descriptor); @@ -140,6 +136,46 @@ static void mca_pml_ob1_rndv_completion( } +/** + * Completion of a get request. + */ + +static void mca_pml_ob1_rget_completion( + mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status) +{ + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; + + /* count bytes of user data actually delivered and check for request completion */ + OPAL_THREAD_LOCK(&ompi_request_lock); + MCA_PML_OB1_SEND_REQUEST_SET_BYTES_DELIVERED(sendreq,des,0); + if (sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) { + MCA_PML_OB1_SEND_REQUEST_COMPLETE(sendreq); + } + OPAL_THREAD_LOCK(&ompi_request_lock); + + /* release resources */ + btl->btl_free(btl,des); +} + + +/** + * Completion of a control message - return resources. + */ + +static void mca_pml_ob1_ctl_completion( + mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* descriptor, + int status) +{ + /* return the descriptor */ + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context; + mca_bml_base_free(bml_btl, descriptor); +} + /** * Completion of additional fragments of a large message - may need * to schedule additional fragments. 
@@ -162,20 +198,11 @@ static void mca_pml_ob1_frag_completion( orte_errmgr.abort(); } - /* count bytes of user data actually delivered */ - MCA_PML_OB1_SEND_REQUEST_SET_BYTES_DELIVERED(sendreq,descriptor,sizeof(mca_pml_ob1_frag_hdr_t)); - -#if MCA_PML_OB1_TIMESTAMPS - if(sendreq->req_pipeline_depth == 1) { - sendreq->t_send2 = get_profiler_timestamp(); - } -#endif - - /* return the descriptor */ - mca_bml_base_free(bml_btl, descriptor); - /* check for request completion */ OPAL_THREAD_LOCK(&ompi_request_lock); + + /* count bytes of user data actually delivered */ + MCA_PML_OB1_SEND_REQUEST_SET_BYTES_DELIVERED(sendreq,descriptor,sizeof(mca_pml_ob1_frag_hdr_t)); if (OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1) == 0 && sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) { MCA_PML_OB1_SEND_REQUEST_COMPLETE(sendreq); @@ -188,6 +215,9 @@ static void mca_pml_ob1_frag_completion( mca_pml_ob1_send_request_schedule(sendreq); } + /* return the descriptor */ + mca_bml_base_free(bml_btl, descriptor); + /* check for pending requests */ MCA_PML_OB1_SEND_REQUEST_PROCESS_PENDING(); } @@ -250,12 +280,11 @@ int mca_pml_ob1_send_request_start_copy( hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr->hdr_common.hdr_flags = 0; hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - hdr->hdr_match.hdr_contextid = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_match.hdr_msg_seq = sendreq->req_send.req_base.req_sequence; + hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; /* update lengths */ segment->seg_len = sizeof(mca_pml_ob1_match_hdr_t) + max_data; @@ -270,78 +299,131 @@ int mca_pml_ob1_send_request_start_copy( /* rendezvous header is required */ } else { - int32_t free_after; - - /* allocate space for hdr + first fragment */ - mca_bml_base_alloc(bml_btl, &descriptor, sizeof(mca_pml_ob1_rendezvous_hdr_t) + size); - if(NULL == descriptor) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - segment = descriptor->des_src; + struct mca_mpool_base_registration_t* registration = NULL; + struct mca_bml_base_btl_t* reg_btl = NULL; + bool do_rdma; /* check to see if memory is registered */ sendreq->req_chunk = mca_mpool_base_find(sendreq->req_send.req_addr); + if(NULL != sendreq->req_chunk) { + struct mca_mpool_base_reg_mpool_t *reg = sendreq->req_chunk->mpools; + while(reg->mpool != NULL) { + if(NULL != (reg_btl = mca_bml_base_btl_array_find(&sendreq->bml_endpoint->btl_rdma, + (mca_btl_base_module_t*) reg->user_data))) { + registration = reg->mpool_registration; + break; + } + reg++; + } + } + /* if the data is already registed or leave_pinned is set - then we + * will attempt to do an rdma of the entire message. 
+ */ + do_rdma = (reg_btl != NULL || mca_pml_ob1.leave_pinned) && + ompi_convertor_need_buffers(&sendreq->req_send.req_convertor) == false; + if(do_rdma == false || (reg_btl != NULL && (reg_btl->btl_flags & MCA_BTL_FLAGS_GET) == 0)) { + int32_t free_after; - /* if the buffer is not pinned and leave pinned is false we eagerly send - data to cover the cost of pinning the recv buffers on the peer */ - if(size && NULL == sendreq->req_chunk && !mca_pml_ob1.leave_pinned) { - - /* pack the data into the supplied buffer */ - iov.iov_base = (void*)((unsigned char*)segment->seg_addr.pval + + /* allocate space for hdr + first fragment */ + mca_bml_base_alloc(bml_btl, &descriptor, sizeof(mca_pml_ob1_rendezvous_hdr_t) + size); + if(NULL == descriptor) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = descriptor->des_src; + + /* rdma put is supported rather than rdma get */ + if(do_rdma) { + max_data = 0; /* dont eager send any data */ + } else { + iov.iov_base = (void*)((unsigned char*)segment->seg_addr.pval + sizeof(mca_pml_ob1_rendezvous_hdr_t)); - iov.iov_len = size; - iov_count = 1; - max_data = size; - if((rc = ompi_convertor_pack( - &sendreq->req_send.req_convertor, - &iov, - &iov_count, - &max_data, - &free_after)) < 0) { - mca_bml_base_free(bml_btl , descriptor); - return rc; + iov.iov_len = size; + iov_count = 1; + max_data = size; + if((rc = ompi_convertor_pack( + &sendreq->req_send.req_convertor, + &iov, + &iov_count, + &max_data, + &free_after)) < 0) { + mca_bml_base_free(bml_btl , descriptor); + return rc; + } } - if(max_data != size) { - opal_output(0, "[%s:%d] max_data (%lu) != size (%lu)\n", __FILE__,__LINE__,max_data,size); + + /* build hdr */ + hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = (sendreq->req_chunk != NULL ? MCA_PML_OB1_HDR_FLAGS_PIN : 0); + hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; + + /* update lengths with number of bytes actually packed */ + segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data; + sendreq->req_send_offset = max_data; + + /* first fragment of a long message */ + descriptor->des_cbfunc = mca_pml_ob1_rndv_completion; + + /* the buffer is already pinned */ + } else { + mca_btl_base_descriptor_t* src; + size_t i; + + /* prepare descriptor */ + if(reg_btl != NULL) { + bml_btl = reg_btl; } - } - /* if the buffer is pinned or leave pinned is true we do not eagerly send - any data */ - else { - max_data = 0; - } - /* build hdr */ - hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; - hdr->hdr_common.hdr_flags = (sendreq->req_chunk != NULL ? 
MCA_PML_OB1_HDR_FLAGS_PIN : 0); - hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; - hdr->hdr_match.hdr_contextid = sendreq->req_send.req_base.req_comm->c_contextid; - hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; - hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; - hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_match.hdr_msg_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_src_req.lval = 0; /* for VALGRIND/PURIFY - REPLACE WITH MACRO */ - hdr->hdr_rndv.hdr_src_req.pval = sendreq; - hdr->hdr_rndv.hdr_frag_length = max_data; + mca_bml_base_prepare_src( + bml_btl, + registration, + &sendreq->req_send.req_convertor, + 0, + &sendreq->req_send.req_bytes_packed, + &src); + if(NULL == src) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + src->des_cbfunc = mca_pml_ob1_rget_completion; + src->des_cbdata = sendreq; - /* update lengths with number of bytes actually packed */ - segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data; - sendreq->req_send_offset = max_data; + /* allocate space for hdr + segment list */ + mca_bml_base_alloc(bml_btl, &descriptor, + sizeof(mca_pml_ob1_rget_hdr_t) + (sizeof(mca_btl_base_segment_t)*(src->des_src_cnt-1))); + if(NULL == descriptor) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + segment = descriptor->des_src; - /* first fragment of a long message */ - descriptor->des_cbfunc = mca_pml_ob1_rndv_completion; + /* build match header */ + hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; + hdr->hdr_common.hdr_flags = (sendreq->req_chunk != NULL ? MCA_PML_OB1_HDR_FLAGS_PIN : 0); + hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; + hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; + hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; + hdr->hdr_rndv.hdr_src_req.pval = sendreq; + hdr->hdr_rget.hdr_des.pval = src; + hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt; + for(i=0; ides_src_cnt; i++) + hdr->hdr_rget.hdr_segs[i] = src->des_src[i]; + descriptor->des_cbfunc = mca_pml_ob1_ctl_completion; + } } descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; descriptor->des_cbdata = sendreq; /* send */ -#if MCA_PML_OB1_TIMESTAMPS - sendreq->t_start = get_profiler_timestamp(); -#endif - rc = mca_bml_base_send(bml_btl, - descriptor, - MCA_BTL_TAG_PML); + rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML); if(OMPI_SUCCESS != rc) { mca_bml_base_free(bml_btl, descriptor ); } @@ -392,12 +474,11 @@ int mca_pml_ob1_send_request_start_prepare( hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr->hdr_common.hdr_flags = 0; hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; - hdr->hdr_match.hdr_contextid = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_match.hdr_msg_seq = sendreq->req_send.req_base.req_sequence; + hdr->hdr_match.hdr_seq = 
sendreq->req_send.req_base.req_sequence; /* short message */ descriptor->des_cbfunc = mca_pml_ob1_match_completion; @@ -446,15 +527,13 @@ int mca_pml_ob1_send_request_start_prepare( hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr->hdr_common.hdr_flags = (sendreq->req_chunk != NULL ? MCA_PML_OB1_HDR_FLAGS_PIN : 0); hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV; - hdr->hdr_match.hdr_contextid = sendreq->req_send.req_base.req_comm->c_contextid; + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; - hdr->hdr_match.hdr_msg_length = sendreq->req_send.req_bytes_packed; - hdr->hdr_match.hdr_msg_seq = sendreq->req_send.req_base.req_sequence; - hdr->hdr_rndv.hdr_src_req.lval = 0; /* for VALGRIND/PURIFY - REPLACE WITH MACRO */ + hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; + hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_src_req.pval = sendreq; - hdr->hdr_rndv.hdr_frag_length = size; /* first fragment of a long message */ descriptor->des_cbfunc = mca_pml_ob1_rndv_completion; @@ -464,12 +543,10 @@ int mca_pml_ob1_send_request_start_prepare( descriptor->des_cbdata = sendreq; /* send */ -#if MCA_PML_OB1_TIMESTAMPS - sendreq->t_start = get_profiler_timestamp(); -#endif - rc = mca_bml_base_send(bml_btl, - descriptor, - MCA_BTL_TAG_PML); + rc = mca_bml_base_send( + bml_btl, + descriptor, + MCA_BTL_TAG_PML); if(OMPI_SUCCESS != rc) { mca_bml_base_free(bml_btl, descriptor ); } @@ -552,7 +629,6 @@ int mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq) hdr = (mca_pml_ob1_frag_hdr_t*)des->des_src->seg_addr.pval; hdr->hdr_common.hdr_flags = 0; hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG; - hdr->hdr_frag_length = size; hdr->hdr_frag_offset = sendreq->req_send_offset; hdr->hdr_src_req.pval = sendreq; hdr->hdr_dst_req = sendreq->req_recv; @@ -575,10 +651,6 @@ int mca_pml_ob1_send_request_schedule(mca_pml_ob1_send_request_t* sendreq) OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); break; } -#if MCA_PML_OB1_TIMESTAMPS - if(bytes_remaining == 0) - sendreq->t_scheduled = get_profiler_timestamp(); -#endif mca_pml_ob1_progress(); } } while (OPAL_THREAD_ADD32(&sendreq->req_lock,-1) > 0); @@ -631,13 +703,6 @@ static void mca_pml_ob1_put_completion( orte_errmgr.abort(); } -#if MCA_PML_OB1_TIMESTAMPS - /* update statistics */ - sendreq->t_fin[sendreq->t_fin_index++] = get_profiler_timestamp(); - if(sendreq->t_fin_index >= MCA_PML_OB1_NUM_TSTAMPS) - sendreq->t_fin_index = 0; -#endif - /* check for request completion */ OPAL_THREAD_LOCK(&ompi_request_lock); sendreq->req_bytes_delivered += frag->rdma_length; @@ -667,10 +732,7 @@ static void mca_pml_ob1_put_completion( hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_src->seg_addr.pval; hdr->hdr_common.hdr_flags = 0; hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; - hdr->hdr_src = frag->rdma_hdr.hdr_rdma.hdr_src; - hdr->hdr_dst = frag->rdma_hdr.hdr_rdma.hdr_dst; - hdr->hdr_rdma_offset = frag->rdma_hdr.hdr_rdma.hdr_rdma_offset; - hdr->hdr_rdma_length = frag->rdma_length; + hdr->hdr_des = frag->rdma_hdr.hdr_rdma.hdr_des; /* queue request */ rc = mca_bml_base_send( @@ -752,12 +814,6 @@ void mca_pml_ob1_send_request_put( } } -#if MCA_PML_OB1_TIMESTAMPS - sendreq->t_pin[sendreq->t_pin_index++] = get_profiler_timestamp(); - if(sendreq->t_pin_index >= MCA_PML_OB1_NUM_TSTAMPS) - 
sendreq->t_pin_index = 0; -#endif - /* setup descriptor */ ompi_convertor_set_position(&sendreq->req_send.req_convertor, &offset); @@ -782,13 +838,6 @@ void mca_pml_ob1_send_request_put( des->des_cbfunc = mca_pml_ob1_put_completion; des->des_cbdata = frag; -#if MCA_PML_OB1_TIMESTAMPS - /* queue put */ - sendreq->t_put[sendreq->t_put_index++] = get_profiler_timestamp(); - if(sendreq->t_put_index >= MCA_PML_OB1_NUM_TSTAMPS) - sendreq->t_put_index = 0; -#endif - if(OMPI_SUCCESS != (rc = mca_bml_base_put(bml_btl, des))) { if(rc == OMPI_ERR_OUT_OF_RESOURCE) { OPAL_THREAD_LOCK(&mca_pml_ob1.lock); diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index 34c34844ef..648c2f6383 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -45,19 +45,6 @@ struct mca_pml_ob1_send_request_t { size_t req_bytes_delivered; size_t req_send_offset; size_t req_rdma_offset; - -#if MCA_PML_OB1_TIMESTAMPS - unsigned long long t_start; - unsigned long long t_send1; - unsigned long long t_send2; - unsigned long long t_scheduled; - unsigned long long t_pin[MCA_PML_OB1_NUM_TSTAMPS]; - unsigned long long t_put[MCA_PML_OB1_NUM_TSTAMPS]; - unsigned long long t_fin[MCA_PML_OB1_NUM_TSTAMPS]; - int t_pin_index; - int t_put_index; - int t_fin_index; -#endif }; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; @@ -113,47 +100,6 @@ OBJ_CLASS_DECLARATION(mca_pml_ob1_send_request_t); * Diagnostic output to trace rdma protocol timing */ -#if MCA_PML_OB1_TIMESTAMPS -#define MCA_PML_OB1_SEND_REQUEST_TSTAMPS_DUMP(sendreq) \ -{ \ - int i; \ - opal_output(0, "[%d,%d,%d] src start, %llu\n", \ - ORTE_NAME_ARGS(orte_process_info.my_name), (sendreq)->t_start); \ -\ - opal_output(0, "[%d,%d,%d] src send start, %llu\n", \ - ORTE_NAME_ARGS(orte_process_info.my_name), (sendreq)->t_send1); \ -\ - opal_output(0, "[%d,%d,%d] src scheduled, %llu\n", \ - ORTE_NAME_ARGS(orte_process_info.my_name), (sendreq)->t_scheduled); \ -\ - opal_output(0, "[%d,%d,%d] src send complete, %llu\n", \ - ORTE_NAME_ARGS(orte_process_info.my_name), (sendreq)->t_send2); \ -\ - for(i=0; i<(sendreq)->t_pin_index; i++) \ - opal_output(0, "[%d,%d,%d] src pin, %llu %llu\n", \ - ORTE_NAME_ARGS(orte_process_info.my_name), (sendreq)->t_pin[i], \ - (sendreq)->t_put[i] - (sendreq)->t_pin[i]); \ - for(i=0; i<(sendreq)->t_put_index; i++) \ - opal_output(0, "[%d,%d,%d] src put, %llu %llu\n", \ - ORTE_NAME_ARGS(orte_process_info.my_name), (sendreq)->t_put[i], \ - (sendreq)->t_fin[i] - (sendreq)->t_put[i]); \ - for(i=0; i<(sendreq)->t_fin_index; i++) \ - opal_output(0, "[%d,%d,%d] src fin, %llu\n", \ - ORTE_NAME_ARGS(orte_process_info.my_name), (sendreq)->t_fin[i]); \ -} - -#define MCA_PML_OB1_SEND_REQUEST_TSTAMPS_INIT(sendreq) \ -{ \ - sendreq->t_pin_index = 0; \ - sendreq->t_put_index = 0; \ - sendreq->t_fin_index = 0; \ -} - -#else -#define MCA_PML_OB1_SEND_REQUEST_TSTAMPS_DUMP(sendreq) -#define MCA_PML_OB1_SEND_REQUEST_TSTAMPS_INIT(sendreq) -#endif - /** * Start a send request. 
@@ -170,7 +116,6 @@ do { break; \ } \ \ - MCA_PML_OB1_SEND_REQUEST_TSTAMPS_INIT(sendreq); \ sendreq->req_lock = 0; \ sendreq->req_pipeline_depth = 0; \ sendreq->req_bytes_delivered = 0; \ @@ -205,12 +150,11 @@ do { hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; \ hdr->hdr_common.hdr_flags = 0; \ hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; \ - hdr->hdr_match.hdr_contextid = sendreq->req_send.req_base.req_comm->c_contextid; \ + hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; \ hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; \ hdr->hdr_match.hdr_dst = sendreq->req_send.req_base.req_peer; \ hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; \ - hdr->hdr_match.hdr_msg_length = 0; \ - hdr->hdr_match.hdr_msg_seq = sendreq->req_send.req_base.req_sequence; \ + hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; \ \ /* short message */ \ descriptor->des_cbfunc = mca_pml_ob1_match_completion; \ @@ -257,7 +201,6 @@ do { (sendreq)->req_send.req_base.req_ompi.req_status._count = \ (sendreq)->req_send.req_bytes_packed; \ (sendreq)->req_send.req_base.req_ompi.req_complete = true; \ - MCA_PML_OB1_SEND_REQUEST_TSTAMPS_DUMP(sendreq); \ if(ompi_request_waiting) { \ opal_condition_broadcast(&ompi_request_cond); \ } \
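Note on the get-based rendezvous path added above: when the send buffer is already registered with an mpool (or mpi_leave_pinned is set) and the convertor can describe the data without bounce buffers, the sender now emits an MCA_PML_OB1_HDR_TYPE_RGET header carrying its prepared source segments, and the receiver pulls the data with mca_bml_base_get() instead of waiting for RDMA puts. Below is a minimal, self-contained C model of that decision. Only the MCA_BTL_FLAGS_* values and the shape of the do_rdma test are taken from the diff; every other name (choose_protocol, proto_t, the boolean parameters) is an illustrative stand-in, not OB1 code.

/*
 * Simplified model of the protocol selection performed in
 * mca_pml_ob1_send_request_start_copy() by this patch.
 */
#include <stdbool.h>
#include <stdio.h>

/* BTL capability flags as redefined in btl.h by this patch */
#define MCA_BTL_FLAGS_SEND 0x1
#define MCA_BTL_FLAGS_PUT  0x2
#define MCA_BTL_FLAGS_GET  0x4

typedef enum {
    PROTO_RNDV_COPY,  /* rendezvous, data packed/copied eagerly          */
    PROTO_RNDV_PUT,   /* rendezvous, receiver later schedules RDMA puts  */
    PROTO_RGET        /* new path: RGET header, receiver does an RDMA get */
} proto_t;

/*
 * rdma_btl_flags: flags of the BTL matching an existing registration,
 *                 or 0 when the buffer is not registered.
 * registered:     an mpool registration covering the buffer was found.
 * leave_pinned:   value of the mpi_leave_pinned parameter.
 * contiguous:     the convertor does not need intermediate buffers.
 */
static proto_t choose_protocol(unsigned rdma_btl_flags, bool registered,
                               bool leave_pinned, bool contiguous)
{
    bool do_rdma = (registered || leave_pinned) && contiguous;

    if (!do_rdma || (registered && (rdma_btl_flags & MCA_BTL_FLAGS_GET) == 0)) {
        /* classic rendezvous: either pack eager data, or send no data and
         * let the receiver drive RDMA puts after it acknowledges */
        return do_rdma ? PROTO_RNDV_PUT : PROTO_RNDV_COPY;
    }
    /* buffer is usable in place and the BTL can do RDMA reads:
     * send an RGET header carrying the prepared source segments */
    return PROTO_RGET;
}

int main(void)
{
    printf("registered + get-capable btl : %d\n",
           choose_protocol(MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET, true, false, true));
    printf("registered + put-only btl    : %d\n",
           choose_protocol(MCA_BTL_FLAGS_PUT, true, false, true));
    printf("unregistered, not pinned     : %d\n",
           choose_protocol(0, false, false, true));
    return 0;
}

In the real path the fallback branch still builds the usual RNDV header, packing eager data only when the buffer is not registered and leave_pinned is off.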
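Note on the "removed 8 bytes from the majority of the pml headers" part of this commit: hdr_msg_length (a uint64_t) is dropped from the match header and kept only in the rendezvous header, hdr_msg_seq is renamed to hdr_seq, and the frag header loses hdr_frag_length entirely. The receive side now derives the payload size from the BTL segment lengths, as in the loop added to mca_pml_ob1_recv_request_progress(). A rough stand-alone sketch of that computation follows; all types here are simplified stand-ins for the OB1/BTL structures, not the real definitions.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    size_t seg_len;            /* length of one BTL segment */
} segment_t;

typedef struct {
    uint8_t hdr_type;          /* MCA_PML_OB1_HDR_TYPE_*    */
    uint8_t hdr_flags;
} common_hdr_t;

typedef struct {
    common_hdr_t hdr_common;
    uint64_t     hdr_frag_offset;  /* offset into the message */
    /* hdr_frag_length is gone: it is implied by the segments */
} frag_hdr_t;

/* Sum the segment lengths and subtract the header carried in the first
 * segment, mirroring the loop added to recv_request_progress(). */
static size_t frag_payload_bytes(const segment_t *segs, size_t nsegs)
{
    size_t bytes = 0;
    for (size_t i = 0; i < nsegs; i++)
        bytes += segs[i].seg_len;
    return bytes - sizeof(frag_hdr_t);
}

int main(void)
{
    segment_t segs[2] = { { 4096 }, { 1024 } };
    printf("payload bytes: %zu\n", frag_payload_bytes(segs, 2));
    return 0;
}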
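Note on the FIN message: it shrinks from carrying source/destination pointers plus an RDMA offset and length down to a single hdr_des field, and the receive-fragment callback now just fires that descriptor's completion callback (rdma->des_cbfunc(btl, NULL, rdma, OMPI_SUCCESS)). A toy illustration of that hand-off is below; the structures are illustrative, not the real mca_btl_base_descriptor_t.

#include <stdio.h>

struct descriptor;
typedef void (*completion_fn)(struct descriptor *des, int status);

typedef struct descriptor {
    completion_fn des_cbfunc;   /* completion callback                 */
    void         *des_cbdata;   /* request / fragment owning the des   */
} descriptor_t;

typedef struct {
    descriptor_t *hdr_des;      /* the only field left in the FIN header */
} fin_hdr_t;

static void rdma_done(descriptor_t *des, int status)
{
    printf("descriptor %p complete, status %d\n", (void *)des, status);
}

/* What the PML does when a FIN arrives: no offsets or lengths to decode,
 * just hand control back to whoever posted the RDMA operation. */
static void handle_fin(const fin_hdr_t *hdr)
{
    descriptor_t *des = hdr->hdr_des;
    des->des_cbfunc(des, 0 /* OMPI_SUCCESS */);
}

int main(void)
{
    descriptor_t des = { rdma_done, NULL };
    fin_hdr_t fin = { &des };
    handle_fin(&fin);
    return 0;
}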