1
1

pml/ob1: update for BTL 3.0 interface

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2015-01-06 08:45:08 -07:00 коммит произвёл Nathan Hjelm
родитель 44fb8369ff
Коммит c4a0e02261
13 изменённых файлов: 715 добавлений и 749 удалений

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
case MCA_PML_OB1_HDR_TYPE_RGET:
type = "RGET";
snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
"seg_cnt %d hdr_des %" PRIu64,
"frag %" PRIu64 " src_ptr %" PRIu64,
hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
hdr->hdr_rndv.hdr_msg_length,
hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval);
hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
hdr->hdr_rget.hdr_src_ptr);
break;
case MCA_PML_OB1_HDR_TYPE_ACK:
type = "ACK";
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64,
snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
hdr->hdr_ack.hdr_send_offset);
hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
break;
case MCA_PML_OB1_HDR_TYPE_FRAG:
type = "FRAG";
@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
break;
case MCA_PML_OB1_HDR_TYPE_PUT:
type = "PUT";
snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]",
hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval,
snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
" dst_ptr %" PRIu64 " dst_size %" PRIu64,
hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len);
hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
break;
case MCA_PML_OB1_HDR_TYPE_FIN:
type = "FIN";
@ -638,37 +639,32 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
*/
int mca_pml_ob1_send_fin( ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl,
opal_ptr_t hdr_des,
opal_ptr_t hdr_frag,
uint64_t rdma_size,
uint8_t order,
uint32_t status )
int status )
{
mca_btl_base_descriptor_t* fin;
mca_pml_ob1_fin_hdr_t* hdr;
int rc;
mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL);
if(NULL == fin) {
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE;
}
fin->des_cbfunc = mca_pml_ob1_fin_completion;
fin->des_cbdata = NULL;
/* fill in header */
hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des;
hdr->hdr_fail = status;
mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
0, hdr_frag.lval, status ? status : (int64_t) rdma_size);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);
/* queue request */
rc = mca_bml_base_send( bml_btl,
fin,
MCA_PML_OB1_HDR_TYPE_FIN );
rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@ -676,7 +672,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, fin);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
return OMPI_ERR_OUT_OF_RESOURCE;
}
@ -717,6 +713,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
pckt->hdr.hdr_ack.hdr_src_req.lval,
pckt->hdr.hdr_ack.hdr_dst_req.pval,
pckt->hdr.hdr_ack.hdr_send_offset,
pckt->hdr.hdr_ack.hdr_send_size,
pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@ -728,9 +725,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
break;
case MCA_PML_OB1_HDR_TYPE_FIN:
rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
pckt->hdr.hdr_fin.hdr_des,
pckt->hdr.hdr_fin.hdr_frag,
pckt->hdr.hdr_fin.hdr_size,
pckt->order,
pckt->hdr.hdr_fin.hdr_fail);
pckt->status);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
return;
}

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t {
mca_pml_ob1_hdr_t hdr;
struct mca_bml_base_btl_t *bml_btl;
uint8_t order;
int status;
};
typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
@ -234,17 +235,17 @@ do { \
(ompi_free_list_item_t*)pckt); \
} while(0)
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \
do { \
mca_pml_ob1_pckt_pending_t *_pckt; \
\
MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \
_pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
_pckt->hdr.hdr_fin.hdr_des = (D); \
_pckt->hdr.hdr_fin.hdr_fail = (S); \
mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \
(D).lval, (Sz)); \
_pckt->proc = (P); \
_pckt->bml_btl = (B); \
_pckt->order = (O); \
_pckt->status = (S); \
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
opal_list_append(&mca_pml_ob1.pckt_pending, \
(opal_list_item_t*)_pckt); \
@ -253,7 +254,7 @@ do { \
int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
opal_ptr_t hdr_des, uint8_t order, uint32_t status);
opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status);
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
* Packets are added to the queue when sending a FIN or ACK fails due to
@ -283,20 +284,6 @@ void mca_pml_ob1_process_pending_rdma(void);
/*
* Compute the total number of bytes on supplied descriptor
*/
static inline size_t
mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments,
                                   size_t count, size_t hdrlen)
{
    /* The segment records are laid out back to back, seg_size bytes apart
     * (seg_size may differ from sizeof(mca_btl_base_segment_t)), so the
     * array is walked with a byte cursor rather than by index. */
    char *cursor = (char *) segments;
    size_t total = 0;

    for (size_t remaining = count; remaining > 0; --remaining) {
        total += ((mca_btl_base_segment_t *) cursor)->seg_len;
        cursor += seg_size;
    }

    /* report the summed segment length without the header bytes */
    return total - hdrlen;
}
static inline size_t
mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments,
size_t count, size_t hdrlen)
@ -338,7 +325,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
/* represent BTL chosen for sending request */
struct mca_pml_ob1_com_btl_t {
mca_bml_base_btl_t *bml_btl;
struct mca_mpool_base_registration_t* btl_reg;
struct mca_btl_base_registration_handle_t *btl_reg;
size_t length;
};
typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -12,6 +13,8 @@
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -123,19 +126,20 @@ size_t mca_pml_ob1_rdma_cuda_btls(
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
mca_mpool_base_registration_t* reg = NULL;
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
mca_btl_base_registration_handle_t *handle = NULL;
if( NULL != btl_mpool ) {
if( NULL != bml_btl->btl->btl_register_mem ) {
/* register the memory */
btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);
handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint,
base, size, MCA_BTL_REG_FLAG_CUDA_GPU_MEM |
MCA_BTL_REG_FLAG_REMOTE_READ);
}
if(NULL == reg)
if(NULL == handle)
continue;
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
rdma_btls[num_btls_used].btl_reg = handle;
weight_total += bml_btl->btl_weight;
num_btls_used++;
}

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -64,6 +64,13 @@ struct mca_pml_ob1_common_hdr_t {
};
typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;
/**
 * Fill in a common ob1 header.
 *
 * @param hdr        header to initialize
 * @param hdr_type   value stored in hdr->hdr_type
 * @param hdr_flags  value stored in hdr->hdr_flags
 */
static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
                                                   uint8_t hdr_flags)
{
    hdr->hdr_flags = hdr_flags;
    hdr->hdr_type = hdr_type;
}
#define MCA_PML_OB1_COMMON_HDR_NTOH(h)
#define MCA_PML_OB1_COMMON_HDR_HTON(h)
@ -89,15 +96,19 @@ struct mca_pml_ob1_match_hdr_t {
typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;
static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
hdr->hdr_ctx = hdr_ctx;
hdr->hdr_src = hdr_src;
hdr->hdr_tag = hdr_tag;
hdr->hdr_seq = hdr_seq;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_MATCH_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while(0)
#else
#define MCA_PML_OB1_MATCH_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
#endif
}
#define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
do { \
@ -111,7 +122,6 @@ do { \
#define MCA_PML_OB1_MATCH_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_MATCH_HDR_FILL(h); \
(h).hdr_ctx = htons((h).hdr_ctx); \
(h).hdr_src = htonl((h).hdr_src); \
(h).hdr_tag = htonl((h).hdr_tag); \
@ -130,12 +140,14 @@ struct mca_pml_ob1_rendezvous_hdr_t {
};
typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RNDV_HDR_FILL(h) \
MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match)
#else
#define MCA_PML_OB1_RNDV_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
/**
 * Fill in a rendezvous header.
 *
 * The embedded match header is initialized through
 * mca_pml_ob1_match_hdr_prepare() with the type, flags, context, source,
 * tag and sequence arguments; the message length and the source request
 * pointer are then stored in the rendezvous-specific fields.
 *
 * @param hdr             header to initialize
 * @param hdr_msg_length  value stored in hdr->hdr_msg_length
 * @param hdr_src_req     pointer stored in hdr->hdr_src_req.pval
 */
static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
                                                       uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
                                                       uint64_t hdr_msg_length, void *hdr_src_req)
{
    mca_pml_ob1_match_hdr_t *match = &hdr->hdr_match;

    mca_pml_ob1_match_hdr_prepare (match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);

    hdr->hdr_src_req.pval = hdr_src_req;
    hdr->hdr_msg_length = hdr_msg_length;
}
/* Note that hdr_src_req is not put in network byte order because it
is never processed by the receiver, other than being copied into
@ -149,7 +161,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
#define MCA_PML_OB1_RNDV_HDR_HTON(h) \
do { \
MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
MCA_PML_OB1_RNDV_HDR_FILL(h); \
(h).hdr_msg_length = hton64((h).hdr_msg_length); \
} while (0)
@ -158,38 +169,47 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
*/
struct mca_pml_ob1_rget_hdr_t {
mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[4];
#endif
opal_ptr_t hdr_des; /**< source descriptor */
opal_ptr_t hdr_frag; /**< source fragment (for fin) */
uint64_t hdr_src_ptr; /**< source pointer */
/* btl registration handle data follows */
};
typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;
static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
{
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RGET_HDR_FILL(h) \
do { \
MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
} while(0)
#else
#define MCA_PML_OB1_RGET_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
hdr->hdr_padding[2] = 0;
hdr->hdr_padding[3] = 0;
#endif
hdr->hdr_frag.pval = hdr_frag;
hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
do { \
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
/* copy registration handle */
memcpy (hdr + 1, local_handle, local_handle_size);
}
/* Convert an RGET header from network to host byte order: the embedded
 * rendezvous header is converted first, then hdr_seg_cnt (32-bit) and
 * hdr_src_ptr (64-bit) are converted in place.
 * NOTE(review): hdr_seg_cnt is still converted here -- confirm that
 * mca_pml_ob1_rget_hdr_t still declares that field. */
#define MCA_PML_OB1_RGET_HDR_NTOH(h) \
do { \
MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
(h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \
} while (0)
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
do { \
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
MCA_PML_OB1_RGET_HDR_FILL(h); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
/* Convert an RGET header from host to network byte order: the embedded
 * rendezvous header is converted first, then hdr_seg_cnt (32-bit) and
 * hdr_src_ptr (64-bit) are converted in place.
 * NOTE(review): hdr_seg_cnt is still converted here -- confirm that
 * mca_pml_ob1_rget_hdr_t still declares that field. */
#define MCA_PML_OB1_RGET_HDR_HTON(h) \
do { \
MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_src_ptr = hton64((h).hdr_src_ptr); \
} while (0)
/**
@ -206,19 +226,23 @@ struct mca_pml_ob1_frag_hdr_t {
};
typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;
static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_frag_offset, void *hdr_src_req,
uint64_t hdr_dst_req)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_FRAG_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
(h).hdr_padding[4] = 0; \
(h).hdr_padding[5] = 0; \
} while(0)
#else
#define MCA_PML_OB1_FRAG_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
hdr->hdr_padding[2] = 0;
hdr->hdr_padding[3] = 0;
hdr->hdr_padding[4] = 0;
hdr->hdr_padding[5] = 0;
#endif
hdr->hdr_frag_offset = hdr_frag_offset;
hdr->hdr_src_req.pval = hdr_src_req;
hdr->hdr_dst_req.lval = hdr_dst_req;
}
#define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
do { \
@ -229,7 +253,6 @@ do { \
#define MCA_PML_OB1_FRAG_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_FRAG_HDR_FILL(h); \
(h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
} while (0)
@ -245,38 +268,45 @@ struct mca_pml_ob1_ack_hdr_t {
opal_ptr_t hdr_src_req; /**< source request */
opal_ptr_t hdr_dst_req; /**< matched receive request */
uint64_t hdr_send_offset; /**< starting point of copy in/out */
uint64_t hdr_send_size; /**< number of bytes requested (0: all remaining) */
};
typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;
static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_send_offset, uint64_t hdr_send_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_ACK_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
(h).hdr_padding[2] = 0; \
(h).hdr_padding[3] = 0; \
(h).hdr_padding[4] = 0; \
(h).hdr_padding[5] = 0; \
} while (0)
#else
#define MCA_PML_OB1_ACK_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
hdr->hdr_padding[2] = 0;
hdr->hdr_padding[3] = 0;
hdr->hdr_padding[4] = 0;
hdr->hdr_padding[5] = 0;
#endif
hdr->hdr_src_req.lval = hdr_src_req;
hdr->hdr_dst_req.pval = hdr_dst_req;
hdr->hdr_send_offset = hdr_send_offset;
hdr->hdr_send_size = hdr_send_size;
}
/* Note that the request headers are not put in NBO because the
src_req is already in receiver's byte order and the dst_req is not
used by the receiver for anything other than backpointers in return
headers */
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
/* Convert an ACK header from network to host byte order. After the common
 * header, the 64-bit hdr_send_offset and hdr_send_size fields are converted
 * in place; the request fields are not touched by this macro. */
#define MCA_PML_OB1_ACK_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
(h).hdr_send_size = ntoh64((h).hdr_send_size); \
} while (0)
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_ACK_HDR_FILL(h); \
/* Convert an ACK header from host to network byte order. After the common
 * header, the 64-bit hdr_send_offset and hdr_send_size fields are converted
 * in place; the request fields are not touched by this macro. */
#define MCA_PML_OB1_ACK_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
(h).hdr_send_offset = hton64((h).hdr_send_offset); \
(h).hdr_send_size = hton64((h).hdr_send_size); \
} while (0)
/**
@ -288,38 +318,55 @@ struct mca_pml_ob1_rdma_hdr_t {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
#endif
uint32_t hdr_seg_cnt; /**< number of segments for rdma */
/* TODO: add real support for multiple destination segments */
opal_ptr_t hdr_req; /**< destination request */
opal_ptr_t hdr_des; /**< source descriptor */
opal_ptr_t hdr_frag; /**< receiver fragment */
opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
uint64_t hdr_rdma_offset; /**< current offset into user buffer */
uint64_t hdr_dst_ptr; /**< destination address */
uint64_t hdr_dst_size; /**< destination size */
/* registration data follows */
};
typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;
static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
uint64_t hdr_dst_size, void *local_handle,
size_t local_handle_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_RDMA_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while(0)
#else
#define MCA_PML_OB1_RDMA_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
#endif
hdr->hdr_req.lval = hdr_req;
hdr->hdr_frag.pval = hdr_frag;
hdr->hdr_recv_req.pval = hdr_recv_req;
hdr->hdr_rdma_offset = hdr_rdma_offset;
hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
hdr->hdr_dst_size = hdr_dst_size;
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
/* copy segments */
memcpy (hdr + 1, local_handle, local_handle_size);
}
/* Convert an RDMA (PUT) header from network to host byte order. After the
 * common header, hdr_seg_cnt (32-bit) and the 64-bit hdr_rdma_offset,
 * hdr_dst_ptr and hdr_dst_size fields are converted in place.
 * NOTE(review): hdr_seg_cnt is still converted here -- confirm that
 * mca_pml_ob1_rdma_hdr_t still declares that field. */
#define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
(h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \
(h).hdr_dst_size = ntoh64((h).hdr_dst_size); \
} while (0)
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_RDMA_HDR_FILL(h); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
/* Convert an RDMA (PUT) header from host to network byte order. After the
 * common header, hdr_seg_cnt (32-bit) and the 64-bit hdr_rdma_offset,
 * hdr_dst_ptr and hdr_dst_size fields are converted in place.
 * NOTE(review): hdr_seg_cnt is still converted here -- confirm that
 * mca_pml_ob1_rdma_hdr_t still declares that field. */
#define MCA_PML_OB1_RDMA_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
(h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
(h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
(h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \
(h).hdr_dst_size = hton64((h).hdr_dst_size); \
} while (0)
/**
@ -331,31 +378,34 @@ struct mca_pml_ob1_fin_hdr_t {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[2];
#endif
uint32_t hdr_fail; /**< RDMA operation failed */
opal_ptr_t hdr_des; /**< completed descriptor */
int64_t hdr_size; /**< number of bytes completed (positive), error code (negative) */
opal_ptr_t hdr_frag; /**< completed RDMA fragment */
};
typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;
static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
uint64_t hdr_frag, int64_t hdr_size)
{
mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_OB1_FIN_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while (0)
#else
#define MCA_PML_OB1_FIN_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
hdr->hdr_padding[0] = 0;
hdr->hdr_padding[1] = 0;
#endif
hdr->hdr_frag.lval = hdr_frag;
hdr->hdr_size = hdr_size;
}
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
do { \
/* Convert a FIN header from network to host byte order. After the common
 * header, only the 64-bit hdr_size field is converted; hdr_frag is not
 * touched by this macro. */
#define MCA_PML_OB1_FIN_HDR_NTOH(h) \
do { \
MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
(h).hdr_size = ntoh64((h).hdr_size); \
} while (0)
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
do { \
#define MCA_PML_OB1_FIN_HDR_HTON(h) \
do { \
MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_OB1_FIN_HDR_FILL(h); \
} while (0)
(h).hdr_size = hton64((h).hdr_size); \
} while (0)
/**
* Union of defined hdr types.

Просмотреть файл

@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
@ -68,7 +68,6 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
ompi_communicator_t * comm)
{
mca_btl_base_descriptor_t *des = NULL;
mca_pml_ob1_match_hdr_t match;
mca_bml_base_btl_t *bml_btl;
opal_convertor_t convertor;
@ -98,28 +97,21 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
size = 0;
}
match.hdr_common.hdr_flags = 0;
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
match.hdr_ctx = comm->c_contextid;
match.hdr_src = comm->c_my_rank;
match.hdr_tag = tag;
match.hdr_seq = seqn;
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
comm->c_contextid, comm->c_my_rank,
tag, seqn);
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);
/* try to send immediately */
rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
MCA_PML_OB1_HDR_TYPE_MATCH, &des);
MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
if (count > 0) {
opal_convertor_cleanup (&convertor);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
if (des) {
mca_bml_base_free (bml_btl, des);
}
return rc;
}
@ -224,7 +216,7 @@ int mca_pml_ob1_send(void *buf,
OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
sendreq->req_send.req_base.req_proc = dst_proc;
sendreq->src_des = NULL;
sendreq->rdma_frag = NULL;
MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
buf,

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -9,6 +10,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -27,11 +30,6 @@
#include "pml_ob1.h"
#include "pml_ob1_rdma.h"
/* Use this registration if no registration needed for a BTL instead of NULL.
* This will help other code to distinguish case when memory is not registered
* from case when registration is not needed */
static mca_mpool_base_registration_t pml_ob1_dummy_reg;
/*
* Check to see if memory is registered or can be registered. Build a
* set of registrations on the request.
@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls(
{
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
double weight_total = 0;
int num_btls_used = 0, n;
int num_btls_used = 0;
/* shortcut when there are no rdma capable btls */
if(num_btls == 0) {
@ -53,29 +51,33 @@ size_t mca_pml_ob1_rdma_btls(
}
/* check to see if memory is registered */
for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
n++) {
for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
(bml_endpoint->btl_rdma_index + n) % num_btls);
mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg;
mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
(bml_endpoint->btl_rdma_index + n) % num_btls);
mca_btl_base_registration_handle_t *reg_handle = NULL;
mca_btl_base_module_t *btl = bml_btl->btl;
if( NULL != btl_mpool ) {
if(!mca_pml_ob1.leave_pinned) {
/* look through existing registrations */
btl_mpool->mpool_find(btl_mpool, base, size, &reg);
} else {
/* register the memory */
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
if (btl->btl_register_mem) {
/* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
* 2) the btl supports put, and 3) the fragment is larger than the minimum
* pipeline size specified by the BTL */
if (!mca_pml_ob1.leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) &&
size > btl->btl_min_rdma_pipeline_size) {
continue;
}
if(NULL == reg)
/* try to register the memory region with the btl */
reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
size, MCA_BTL_REG_FLAG_REMOTE_READ);
if (NULL == reg_handle) {
/* btl requires registration but the registration failed */
continue;
}
}
} /* else no registration is needed with this btl */
rdma_btls[num_btls_used].bml_btl = bml_btl;
rdma_btls[num_btls_used].btl_reg = reg;
rdma_btls[num_btls_used].btl_reg = reg_handle;
weight_total += bml_btl->btl_weight;
num_btls_used++;
}
@ -83,7 +85,7 @@ size_t mca_pml_ob1_rdma_btls(
/* if we don't use leave_pinned and all BTLs that already have this memory
* registered amount to less than half of available bandwidth - fall back to
* pipeline protocol */
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
return 0;
mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
@ -103,10 +105,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
rdma_btls[i].bml_btl =
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
rdma_btls[i].btl_reg = NULL;
else
rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
rdma_btls[i].btl_reg = NULL;
weight_total += rdma_btls[i].bml_btl->btl_weight;
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -9,6 +10,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -21,9 +24,13 @@
#include "pml_ob1.h"
#include "pml_ob1_rdmafrag.h"
/* Constructor for mca_pml_ob1_rdma_frag_t objects: clears local_handle so a
 * newly constructed fragment starts without a registration handle. */
static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag)
{
frag->local_handle = NULL;
}
OBJ_CLASS_INSTANCE(
mca_pml_ob1_rdma_frag_t,
ompi_free_list_item_t,
NULL,
mca_pml_ob1_rdma_frag_constructor,
NULL);

Просмотреть файл

@ -10,6 +10,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,38 +34,52 @@ typedef enum {
MCA_PML_OB1_RDMA_GET
} mca_pml_ob1_rdma_state_t;
struct mca_pml_ob1_rdma_frag_t;
typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
/**
* Used to keep track of local and remote RDMA operations.
*/
struct mca_pml_ob1_rdma_frag_t {
ompi_free_list_item_t super;
mca_bml_base_btl_t* rdma_bml;
mca_bml_base_btl_t *rdma_bml;
mca_pml_ob1_hdr_t rdma_hdr;
mca_pml_ob1_rdma_state_t rdma_state;
size_t rdma_length;
uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
void *rdma_req;
struct mca_bml_base_endpoint_t* rdma_ep;
opal_convertor_t convertor;
mca_mpool_base_registration_t* reg;
uint32_t retries;
mca_pml_ob1_rdma_frag_callback_t cbfunc;
uint64_t rdma_offset;
void *local_address;
mca_btl_base_registration_handle_t *local_handle;
uint64_t remote_address;
uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
};
typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t);
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
do { \
ompi_free_list_item_t* item; \
#define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
do { \
ompi_free_list_item_t* item; \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \
frag = (mca_pml_ob1_rdma_frag_t*)item; \
} while(0)
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
do { \
/* return fragment */ \
OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
(ompi_free_list_item_t*)frag); \
frag = (mca_pml_ob1_rdma_frag_t*)item; \
} while(0)
/*
 * Return an RDMA fragment to the mca_pml_ob1.rdma_frags free list.
 *
 * If the fragment still carries a local registration handle, the handle is
 * first deregistered through the fragment's rdma_bml and the field is reset
 * to NULL, so the fragment goes back to the list without a handle.
 *
 * The argument is parenthesized at every use so that pointer expressions
 * (for example `base + 1`) expand correctly. It is evaluated several times
 * and must therefore be free of side effects.
 */
#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
do { \
    /* return fragment */ \
    if ((frag)->local_handle) { \
        mca_bml_base_deregister_mem ((frag)->rdma_bml, (frag)->local_handle); \
        (frag)->local_handle = NULL; \
    } \
    OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
                             (ompi_free_list_item_t*)(frag)); \
} while (0)
END_C_DECLS

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
ompi_communicator_t *comm_ptr;
mca_pml_ob1_recv_request_t *match = NULL;
mca_pml_ob1_comm_t *comm;
mca_pml_ob1_comm_proc_t *proc;
size_t num_segments = des->des_local_count;
size_t num_segments = des->des_segment_count;
size_t bytes_received = 0;
assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV);
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
return;
}
@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@ -281,7 +281,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET);
des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
return;
}
@ -292,9 +292,10 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des,
void* cbdata )
{
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_send_request_t* sendreq;
size_t size;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
return;
@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
/* if the request should be delivered entirely by copy in/out
* then throttle sends */
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
if (NULL != sendreq->src_des) {
/* release registered memory */
mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
sendreq->src_des = NULL;
if (NULL != sendreq->rdma_frag) {
if (NULL != sendreq->rdma_frag->local_handle) {
mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
sendreq->rdma_frag->local_handle = NULL;
}
MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
sendreq->rdma_frag = NULL;
}
sendreq->req_throttle_sends = true;
}
mca_pml_ob1_send_request_copy_in_out(sendreq,
hdr->hdr_ack.hdr_send_offset,
sendreq->req_send.req_bytes_packed -
hdr->hdr_ack.hdr_send_offset);
if (hdr->hdr_ack.hdr_send_size) {
size = hdr->hdr_ack.hdr_send_size;
} else {
size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
}
mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);
if (sendreq->req_state != 0) {
/* Typical receipt of an ACK message causes req_state to be
@ -355,13 +362,14 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_recv_request_t* recvreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
return;
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
@ -372,7 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
/* This will trigger the opal_convertor_pack to start asynchronous copy. */
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des);
mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);
/* Let BTL know that it CANNOT free the frag */
des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
@ -380,7 +388,8 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
return;
}
#endif /* OPAL_CUDA_SUPPORT */
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
return;
}
@ -390,7 +399,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_pml_ob1_send_request_t* sendreq;
@ -410,20 +419,17 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_local;
mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
mca_btl_base_descriptor_t* rdma;
mca_btl_base_segment_t* segments = des->des_segments;
mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
mca_pml_ob1_rdma_frag_t *frag;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
return;
}
ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
rdma->des_cbfunc(btl, NULL, rdma,
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
return;
frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
frag->cbfunc (frag, hdr->hdr_size);
}
@ -699,7 +705,7 @@ out_of_order_match:
OPAL_THREAD_UNLOCK(&comm->matching_lock);
if(OPAL_LIKELY(match)) {
switch(type) {
switch(type) {
case MCA_PML_OB1_HDR_TYPE_MATCH:
mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
break;

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2012 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
@ -150,12 +150,17 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque
request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free;
request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel;
request->req_rdma_cnt = 0;
request->local_handle = NULL;
OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
}
/*
 * Destructor for an ob1 receive request: tears down the request lock and,
 * if a local registration handle is still attached, deregisters it through
 * the request's rdma_bml and clears the field.
 */
static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request)
{
    OBJ_DESTRUCT(&request->lock);

    /* a handle is not expected to be left over at this point */
    if (OPAL_UNLIKELY(NULL != request->local_handle)) {
        mca_bml_base_deregister_mem (request->rdma_bml,
                                     request->local_handle);
        request->local_handle = NULL;
    }
}
OBJ_CLASS_INSTANCE(
@ -183,31 +188,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl,
* Put operation has completed remotely - update request status
*/
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size)
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata;
size_t bytes_received = 0;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
if( OPAL_LIKELY(status == OMPI_SUCCESS) ) {
bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count, 0);
}
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
mca_bml_base_free(bml_btl, des);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
/* check completion status */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
if(recv_request_pml_complete_check(recvreq) == false &&
if (OPAL_LIKELY(0 < rdma_size)) {
assert (rdma_size == frag->rdma_length);
/* check completion status */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
if (recv_request_pml_complete_check(recvreq) == false &&
recvreq->req_rdma_offset < recvreq->req_send_offset) {
/* schedule additional rdma operations */
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
/* schedule additional rdma operations */
mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
}
}
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
@ -218,7 +219,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
int mca_pml_ob1_recv_request_ack_send_btl(
ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
bool nordma)
uint64_t size, bool nordma)
{
mca_btl_base_descriptor_t* des;
mca_pml_ob1_ack_hdr_t* ack;
@ -234,12 +235,9 @@ int mca_pml_ob1_recv_request_ack_send_btl(
}
/* fill out header */
ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval;
ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;
ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0;
ack->hdr_src_req.lval = hdr_src_req;
ack->hdr_dst_req.pval = hdr_dst_req;
ack->hdr_send_offset = hdr_send_offset;
ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval;
mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0,
hdr_src_req, hdr_dst_req, hdr_send_offset, size);
ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc);
@ -313,63 +311,99 @@ static int mca_pml_ob1_recv_request_ack(
if(recvreq->req_send_offset == hdr->hdr_msg_length)
return OMPI_SUCCESS;
}
/* let the schedule function know there is no need to set the ACK flag */
recvreq->req_ack_sent = true;
return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
recvreq, recvreq->req_send_offset,
recvreq, recvreq->req_send_offset, 0,
recvreq->req_send_offset == bytes_received);
}
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag);
/**
 * Recover from a failed RDMA get on one fragment of a receive request.
 *
 * Recovery is attempted in this order:
 *  1. If the get is not available for this transfer (OMPI_ERR_NOT_AVAILABLE),
 *     ask the peer to put the data instead. On success the outstanding put
 *     owns the fragment until its completion callback runs, so nothing more
 *     is done here. If the put request ran out of resources the fragment is
 *     queued on mca_pml_ob1.rdma_pending.
 *  2. If the failure is OMPI_ERR_OUT_OF_RESOURCE and the retry limit has not
 *     been reached, queue the fragment on mca_pml_ob1.rdma_pending.
 *  3. Otherwise send an ACK asking the peer to fall back on send for the
 *     region [rdma_offset, rdma_offset + rdma_length) and release the
 *     fragment.
 *
 * @param frag  fragment whose get failed
 * @param rc    error code reported for the failed get
 *
 * @return OMPI_SUCCESS if the put fallback was issued or the fragment was
 *         queued for retry; otherwise the result of sending the ACK.
 */
static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
{
    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
    ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;

    if (OMPI_ERR_NOT_AVAILABLE == rc) {
        /* get isn't supported for this transfer. tell peer to fallback on put */
        rc = mca_pml_ob1_recv_request_put_frag (frag);
        if (OMPI_SUCCESS == rc) {
            /* the put is in flight and still references the fragment; do not
             * fall through to the send fallback, which would release it */
            return OMPI_SUCCESS;
        }

        if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
            opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);

            return OMPI_SUCCESS;
        }
    }

    /* transient resource shortage: retry later while under the retry limit */
    if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
        OMPI_ERR_OUT_OF_RESOURCE == rc) {
        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);

        return OMPI_SUCCESS;
    }

    /* tell peer to fall back on send for this region */
    rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
                                           recvreq, frag->rdma_offset, frag->rdma_length, false);
    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);

    return rc;
}
/**
* Return resources used by the RDMA
*/
static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *cbdata, int status)
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(status);
ompi_rte_abort(-1, NULL);
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
/* TSW - FIX */
OMPI_ERROR_LOG(status);
ompi_rte_abort(-1, NULL);
}
} else {
/* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
/* TODO: re-add order */
mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
frag->rdma_length, 0, 0);
recv_request_pml_complete_check(recvreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
}
/* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) {
mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rget.hdr_des,
des->order, 0);
}
recv_request_pml_complete_check(recvreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
mca_btl_base_descriptor_t *dst) {
static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
{
mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t *ctl;
mca_pml_ob1_rdma_hdr_t *hdr;
size_t seg_size;
size_t reg_size;
int rc;
seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count;
reg_size = bml_btl->btl->btl_registration_handle_size;
/* prepare a descriptor for rdma control message */
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size,
mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL);
if (OPAL_UNLIKELY(NULL == ctl)) {
@ -378,26 +412,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
/* fill in rdma header */
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval;
mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0,
recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset,
frag->local_address, frag->rdma_length, frag->local_handle,
reg_size);
hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_des.pval = dst;
hdr->hdr_recv_req.pval = recvreq;
frag->cbfunc = mca_pml_ob1_put_completion;
hdr->hdr_seg_cnt = dst->des_local_count;
recvreq->req_ack_sent = true;
/* copy segments */
memcpy (hdr + 1, dst->des_local, seg_size);
dst->des_cbfunc = mca_pml_ob1_put_completion;
dst->des_cbdata = recvreq;
if (!recvreq->req_ack_sent)
recvreq->req_ack_sent = true;
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base), size,
PERUSE_RECV);
/* send rdma request to peer */
rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
@ -412,71 +439,30 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
/*
*
*/
int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag )
int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag)
{
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t* descriptor;
size_t save_size = frag->rdma_length;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
int rc;
/* prepare descriptor */
mca_bml_base_prepare_dst( bml_btl,
NULL,
&recvreq->req_recv.req_base.req_convertor,
MCA_BTL_NO_ORDER,
0,
&frag->rdma_length,
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
MCA_BTL_DES_FLAGS_GET,
&descriptor );
if( OPAL_UNLIKELY(NULL == descriptor) ) {
if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
frag->rdma_length = save_size;
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else {
ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
/* tell peer to fall back on send */
recvreq->req_send_offset = 0;
rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
recvreq, recvreq->req_send_offset, true);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
return rc;
if (bml_btl->btl->btl_register_mem && !frag->local_handle) {
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE |
MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle);
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
}
}
descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
descriptor->des_cbfunc = mca_pml_ob1_rget_completion;
descriptor->des_cbdata = frag;
PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base),
&(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base),
frag->rdma_length, PERUSE_RECV);
/* queue up get request */
rc = mca_bml_base_get(bml_btl,descriptor);
rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, frag->local_handle,
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
/* get isn't supported for this transfer. tell peer to fallback on put */
rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
}
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
mca_bml_base_free(bml_btl, descriptor);
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending,
(opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
OMPI_ERROR_LOG(rc);
ompi_rte_abort(-1, NULL);
}
return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
}
return OMPI_SUCCESS;
@ -502,6 +488,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
sizeof(mca_pml_ob1_frag_hdr_t));
data_offset = hdr->hdr_frag.hdr_frag_offset;
/*
* Make user buffer accessible(defined) before unpacking.
*/
@ -573,7 +560,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr
/* Store the receive request in unused context pointer. */
des->des_context = (void *)recvreq;
/* Store the amount of bytes in unused remote count value */
des->des_remote_count = bytes_delivered;
des->des_segment_count = bytes_delivered;
/* Then record an event that will get triggered by a PML progress call which
* checks the stream events. If we get an error, abort. Should get message
* from CUDA code about what went wrong. */
@ -598,7 +585,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
int status )
{
mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context;
size_t bytes_received = des->des_remote_count;
size_t bytes_received = des->des_segment_count;
OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des));
/* Call into the BTL so it can free the descriptor. At this point, it is
@ -629,7 +616,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval;
mca_bml_base_endpoint_t* bml_endpoint = NULL;
size_t bytes_remaining, prev_sent, offset;
mca_btl_base_segment_t *r_segments;
mca_pml_ob1_rdma_frag_t *frag;
mca_bml_base_btl_t *rdma_bml;
int rc;
@ -637,6 +623,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
prev_sent = offset = 0;
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
recvreq->req_send_offset = 0;
MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
@ -680,8 +667,10 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
ompi_rte_abort(-1, NULL);
}
bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1),
hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc);
bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
/* save the request for put fallback */
recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
/* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the number
* of bytes left to be sent. In each iteration we send the max possible bytes supported
@ -690,7 +679,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
* the next iteration with the updated size.
* Also - In each iteration we update the location in the buffer to be used for writing
* the message ,and the location to read from. This is done using the offset variable that
* accumulates the number of bytes that were sent so far. */
* accumulates the number of bytes that were sent so far.
*
* NTH: This fragmentation may go away if we change the btls to require them to handle
* get fragmentation internally. This is a reasonable solution since some btls do not
* need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up
* being the case. */
while (bytes_remaining > 0) {
/* allocate/initialize a fragment */
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
@ -700,29 +694,31 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
ompi_rte_abort(-1, NULL);
}
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
/* update the read location -- NTH: note this will only work if there is exactly one
segment. TODO -- make this work with multiple segments */
r_segments = (mca_btl_base_segment_t *) frag->rdma_segs;
r_segments->seg_addr.lval += offset;
/* update the read location */
frag->remote_address = hdr->hdr_src_ptr + offset;
/* updating the write location */
OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address);
OPAL_THREAD_UNLOCK(&recvreq->lock);
frag->rdma_bml = rdma_bml;
frag->rdma_hdr.hdr_rget = *hdr;
frag->retries = 0;
frag->rdma_req = recvreq;
frag->rdma_ep = bml_endpoint;
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
frag->reg = NULL;
frag->rdma_length = bytes_remaining;
frag->retries = 0;
frag->rdma_req = recvreq;
frag->rdma_state = MCA_PML_OB1_RDMA_GET;
frag->local_handle = NULL;
frag->rdma_offset = offset;
if (bytes_remaining > rdma_bml->btl->btl_get_limit) {
frag->rdma_length = rdma_bml->btl->btl_get_limit;
} else {
frag->rdma_length = bytes_remaining;
}
/* NTH: TODO -- handle error conditions gracefully */
rc = mca_pml_ob1_recv_request_get_frag(frag);
@ -921,13 +917,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
while(bytes_remaining > 0 &&
recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
size_t size, seg_size;
mca_pml_ob1_rdma_hdr_t* hdr;
mca_btl_base_descriptor_t* dst;
mca_btl_base_descriptor_t* ctl;
mca_mpool_base_registration_t * reg = NULL;
mca_btl_base_module_t* btl;
mca_pml_ob1_rdma_frag_t *frag = NULL;
mca_btl_base_module_t *btl;
int rc, rdma_idx;
void *data_ptr;
size_t size;
if(prev_bytes_remaining == bytes_remaining) {
if(++num_fail == num_tries) {
@ -948,86 +942,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
do {
rdma_idx = recvreq->req_rdma_idx;
bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
reg = recvreq->req_rdma[rdma_idx].btl_reg;
size = recvreq->req_rdma[rdma_idx].length;
if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
recvreq->req_rdma_idx = 0;
} while(!size);
btl = bml_btl->btl;
/* makes sure that we don't exceed BTL max rdma size
* if memory is not pinned already */
if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) &&
(size > btl->btl_rdma_pipeline_frag_size)) {
/* NTH: This conditional used to check if there was a registration in
* recvreq->req_rdma[rdma_idx].btl_reg. If one existed it was due to
* the btl not needing registration (equivalent to btl->btl_register_mem
* != NULL). This new check is equivalent. Note: I feel this protocol
* needs work to better improve resource usage when running with a
* leave pinned protocol. */
if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
(size > btl->btl_rdma_pipeline_frag_size)) {
size = btl->btl_rdma_pipeline_frag_size;
}
/* take lock to protect converter against concurrent access
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if (OPAL_UNLIKELY(NULL == frag)) {
continue;
}
/* take lock to protect convertor against concurrent access
* from unpack */
OPAL_THREAD_LOCK(&recvreq->lock);
opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor,
&recvreq->req_rdma_offset );
/* prepare a descriptor for RDMA */
mca_bml_base_prepare_dst(bml_btl, reg,
&recvreq->req_recv.req_base.req_convertor,
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_FLAGS_PUT, &dst);
opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor,
&recvreq->req_rdma_offset);
opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
OPAL_THREAD_UNLOCK(&recvreq->lock);
if(OPAL_UNLIKELY(dst == NULL)) {
continue;
if (btl->btl_register_mem) {
mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE,
&frag->local_handle);
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
continue;
}
}
dst->des_cbfunc = mca_pml_ob1_put_completion;
dst->des_cbdata = recvreq;
/* fill in the minimum information needed to handle the fin message */
frag->cbfunc = mca_pml_ob1_put_completion;
frag->rdma_length = size;
frag->rdma_req = recvreq;
frag->rdma_bml = bml_btl;
frag->local_address = data_ptr;
frag->rdma_offset = recvreq->req_rdma_offset;
seg_size = btl->btl_seg_size * dst->des_local_count;
/* prepare a descriptor for rdma control message */
mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL);
if( OPAL_UNLIKELY(NULL == ctl) ) {
mca_bml_base_free(bml_btl,dst);
continue;
}
ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
/* fill in rdma header */
hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
hdr->hdr_req = recvreq->remote_req_send;
hdr->hdr_des.pval = dst;
hdr->hdr_recv_req.pval = recvreq;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_seg_cnt = dst->des_local_count;
/* copy segments */
memmove (hdr + 1, dst->des_local, seg_size);
if(!recvreq->req_ack_sent)
recvreq->req_ack_sent = true;
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc);
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(recvreq->req_recv.req_base), size,
PERUSE_RECV);
/* send rdma request to peer */
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
if( OPAL_LIKELY( rc >= 0 ) ) {
rc = mca_pml_ob1_recv_request_put_frag (frag);
if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
/* update request state */
recvreq->req_rdma_offset += size;
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
recvreq->req_rdma[rdma_idx].length -= size;
bytes_remaining -= size;
} else {
mca_bml_base_free(bml_btl,ctl);
mca_bml_base_free(bml_btl,dst);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
}
}

Просмотреть файл

@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
@ -10,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -131,7 +132,7 @@ do { \
#define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq) \
{ \
MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv); \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests, \
(ompi_free_list_item_t*)(recvreq)); \
}
@ -154,9 +155,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
}
for(i = 0; i < recvreq->req_rdma_cnt; i++) {
mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
if( NULL != btl_reg && btl_reg->mpool != NULL) {
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg;
mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl;
if (NULL != handle) {
mca_bml_base_deregister_mem (bml_btl, handle);
}
}
recvreq->req_rdma_cnt = 0;
@ -178,6 +181,10 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
MPI_ERR_TRUNCATE;
}
if (OPAL_UNLIKELY(recvreq->local_handle)) {
mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle);
recvreq->local_handle = NULL;
}
MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
}
OPAL_THREAD_UNLOCK(&ompi_request_lock);
@ -387,7 +394,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
(void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
}
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz) \
do { \
mca_pml_ob1_pckt_pending_t *_pckt; \
\
@ -396,6 +403,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
_pckt->hdr.hdr_ack.hdr_src_req.lval = (S); \
_pckt->hdr.hdr_ack.hdr_dst_req.pval = (D); \
_pckt->hdr.hdr_ack.hdr_send_offset = (O); \
_pckt->hdr.hdr_ack.hdr_send_size = (Sz); \
_pckt->proc = (P); \
_pckt->bml_btl = NULL; \
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
@ -406,11 +414,11 @@ static inline void mca_pml_ob1_recv_request_schedule(
int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
uint64_t hdr_rdma_offset, bool nordma);
uint64_t hdr_rdma_offset, uint64_t size, bool nordma);
static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
bool nordma)
uint64_t size, bool nordma)
{
size_t i;
mca_bml_base_btl_t* bml_btl;
@ -420,12 +428,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS)
return OMPI_SUCCESS;
}
MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
hdr_send_offset);
hdr_send_offset, size);
return OMPI_ERR_OUT_OF_RESOURCE;
}

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -137,6 +137,7 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel;
req->req_rdma_cnt = 0;
req->req_throttle_sends = false;
req->rdma_frag = NULL;
OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
}
@ -145,6 +146,10 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
{
/* Destruct the send-range list and its lock; both are set up with
 * OBJ_CONSTRUCT in the matching request constructor. */
OBJ_DESTRUCT(&req->req_send_ranges);
OBJ_DESTRUCT(&req->req_send_range_lock);
/* If an RDMA fragment is still attached to the request, hand it back and
 * clear the pointer so the same fragment is not returned a second time. */
if (req->rdma_frag) {
MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag);
req->rdma_frag = NULL;
}
}
OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
@ -236,10 +241,9 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
* happens in one thread, the increase of the req_bytes_delivered does not
* have to be atomic.
*/
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count,
sizeof(mca_pml_ob1_rendezvous_hdr_t));
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
des->des_segment_count,
sizeof(mca_pml_ob1_rendezvous_hdr_t));
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
}
@ -250,27 +254,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
*/
static void
mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
size_t req_bytes_delivered;
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
/* count bytes of user data actually delivered and check for request completion */
if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count, 0);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
if (OPAL_LIKELY(0 < rdma_length)) {
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length);
}
sendreq->src_des = NULL;
send_request_pml_complete_check(sendreq);
/* free the descriptor */
mca_bml_base_free(bml_btl, des);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
@ -314,10 +309,9 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
}
/* count bytes of user data actually delivered */
req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
(void *) des->des_local,
des->des_local_count,
sizeof(mca_pml_ob1_frag_hdr_t));
req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
des->des_segment_count,
sizeof(mca_pml_ob1_frag_hdr_t));
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
@ -389,7 +383,7 @@ int mca_pml_ob1_send_request_start_buffered(
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* pack the data into the BTL supplied buffer */
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
@ -408,17 +402,14 @@ int mca_pml_ob1_send_request_start_buffered(
/* build rendezvous header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_bytes_packed, sendreq);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
/* update lengths */
segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
@ -491,15 +482,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
if(NULL != bml_btl->btl->btl_sendi) {
mca_pml_ob1_match_hdr_t match;
match.hdr_common.hdr_flags = 0;
match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
match.hdr_tag = sendreq->req_send.req_base.req_tag;
match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence);
ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
/* try to send immediately */
rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
@ -532,7 +521,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
if(size > 0) {
/* pack the data into the supplied buffer */
@ -566,15 +555,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
/* build match header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
/* update lengths */
segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
@ -618,7 +605,6 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
/* prepare descriptor */
mca_bml_base_prepare_src( bml_btl,
NULL,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
OMPI_PML_OB1_MATCH_HDR_LEN,
@ -628,19 +614,17 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build match header */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);
/* short message */
des->des_cbfunc = mca_pml_ob1_match_completion_free;
@ -674,80 +658,68 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
* one RDMA capable BTLs). This way round robin distribution of RDMA
* operation is achieved.
*/
mca_btl_base_descriptor_t *des, *src = NULL;
mca_btl_base_registration_handle_t *local_handle;
mca_btl_base_descriptor_t *des;
mca_pml_ob1_rdma_frag_t *frag;
mca_pml_ob1_rget_hdr_t *hdr;
size_t seg_size;
size_t reg_size;
void *data_ptr;
int rc;
sendreq->src_des = NULL;
bml_btl = sendreq->req_rdma[0].bml_btl;
if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
sendreq->rdma_frag = NULL;
/* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
MCA_PML_OB1_HDR_FLAGS_PIN);
}
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_defined,
sendreq->req_send.req_base.req_addr,
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype);
);
/* prepare source descriptor/segment(s) */
/* PML owns this descriptor and will free it in */
/* mca_pml_ob1_rget_completion */
mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET |
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src );
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess,
sendreq->req_send.req_base.req_addr,
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype);
);
if( OPAL_UNLIKELY(NULL == src) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
/* at this time ob1 does not support non-contiguous gets. the convertor represents a
* contiguous block of memory */
opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
local_handle = sendreq->req_rdma[0].btl_reg;
/* allocate an rdma fragment to keep track of the request size for use in the fin message */
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
src->des_cbfunc = mca_pml_ob1_rget_completion;
src->des_cbdata = sendreq;
sendreq->src_des = src;
/* fill in necessary fragment data */
frag->rdma_req = sendreq;
frag->rdma_bml = bml_btl;
frag->rdma_length = size;
frag->cbfunc = mca_pml_ob1_rget_completion;
/* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */
seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
/* save the fragment for get->put fallback */
sendreq->rdma_frag = frag;
reg_size = bml_btl->btl->btl_registration_handle_size;
/* allocate space for get hdr + segment list */
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size,
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_FLAGS_SIGNAL);
if( OPAL_UNLIKELY(NULL == des) ) {
/* NTH: no need to reset the converter here. it will be reset before it is retried */
mca_bml_base_free(bml_btl, src);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* build match header */
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval;
hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET;
hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
hdr->hdr_des.pval = src;
hdr->hdr_seg_cnt = src->des_local_count;
hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval;
/* TODO -- Add support for multiple segments for get */
mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_bytes_packed, sendreq,
frag, data_ptr, local_handle, reg_size);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);
/* copy segment data */
memcpy (hdr + 1, src->des_local, seg_size);
des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
des->des_cbdata = sendreq;
@ -765,12 +737,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
if (OPAL_UNLIKELY(rc < 0)) {
mca_bml_base_free(bml_btl, des);
if (sendreq->src_des) {
mca_bml_base_free (bml_btl, sendreq->src_des);
sendreq->src_des = NULL;
}
return rc;
}
@ -808,7 +774,6 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_send.req_base.req_datatype);
);
mca_bml_base_prepare_src( bml_btl,
NULL,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_rendezvous_hdr_t),
@ -827,21 +792,19 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
if( OPAL_UNLIKELY(NULL == des) ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
segment = des->des_local;
segment = des->des_segments;
/* build hdr */
hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
hdr->hdr_common.hdr_flags = flags | MCA_PML_OB1_HDR_FLAGS_SIGNAL;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags |
MCA_PML_OB1_HDR_FLAGS_SIGNAL,
sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank,
sendreq->req_send.req_base.req_tag,
(uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_bytes_packed, sendreq);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);
/* first fragment of a long message */
des->des_cbdata = sendreq;
@ -1022,13 +985,10 @@ cannot_pack:
sendreq->req_send.req_base.req_count,
sendreq->req_send.req_base.req_datatype);
);
mca_bml_base_prepare_src(bml_btl, NULL,
&sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER,
sizeof(mca_pml_ob1_frag_hdr_t),
mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor,
MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t),
&size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
MCA_BTL_DES_FLAGS_SIGNAL,
&des);
MCA_BTL_DES_FLAGS_SIGNAL, &des);
MEMCHECKER(
memchecker_call(&opal_memchecker_base_mem_noaccess,
sendreq->req_send.req_base.req_addr,
@ -1051,12 +1011,9 @@ cannot_pack:
des->des_cbdata = sendreq;
/* setup header */
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG;
hdr->hdr_frag_offset = range->range_send_offset;
hdr->hdr_src_req.pval = sendreq;
hdr->hdr_dst_req = sendreq->req_recv;
hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval;
mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
sendreq->req_recv.lval);
ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
sendreq->req_send.req_base.req_proc);
@ -1113,38 +1070,66 @@ cannot_pack:
}
/**
 * Handle a put fragment that could not be started.
 *
 * The fragment's retry counter is incremented on every call. When the failure
 * was a resource error and the retry limit has not been reached, the fragment
 * is appended to the pending RDMA list so it can be retried later. Otherwise
 * a FIN carrying an error status is sent to the peer and the fragment's data
 * is sent with the copy in/out (send/recv) path instead.
 *
 * @param frag  RDMA fragment whose put failed
 * @param rc    error code reported for the failed put
 */
static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
{
    mca_pml_ob1_send_request_t *req = (mca_pml_ob1_send_request_t *) frag->rdma_req;
    mca_bml_base_btl_t *put_btl = frag->rdma_bml;

    /* the retry counter is bumped for every failure, whatever the error code */
    ++frag->retries;

    if (OMPI_ERR_OUT_OF_RESOURCE == rc && frag->retries < mca_pml_ob1.rdma_retries_limit) {
        /* resource error with retries left: park the fragment on the pending list */
        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
        return;
    }

    /* give up on RDMA for this fragment: tell the receiver to deregister memory */
    mca_pml_ob1_send_fin (req->req_send.req_base.req_proc, put_btl,
                          frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
                          OPAL_ERR_TEMP_OUT_OF_RESOURCE);

    /* push the fragment's range through the copy in/out path */
    mca_pml_ob1_send_request_copy_in_out(req, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
                                         frag->rdma_length);

    /* a missing receive-request pointer means the ACK has not arrived yet;
     * sends must not be scheduled before the ACK */
    if (NULL != req->req_recv.pval) {
        mca_pml_ob1_send_request_schedule (req);
    }
}
/**
* An RDMA put operation has completed:
* (1) Update request status and if required set completed
* (2) Send FIN control message to the destination
* (2) Send FIN control message to the destination
*/
static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* des,
int status )
static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *cbdata, int status)
{
mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(status);
ompi_rte_abort(-1, NULL);
if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
/* TODO -- readd ordering */
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
0, 0);
/* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
send_request_pml_complete_check(sendreq);
} else {
/* try to fall back on send/recv */
mca_pml_ob1_send_request_put_frag_failed (frag, status);
}
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_des,
des->order, 0);
/* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
send_request_pml_complete_check(sendreq);
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@ -1152,81 +1137,45 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
{
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
mca_mpool_base_registration_t *reg = NULL;
mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_btl_base_registration_handle_t *local_handle = NULL;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
mca_btl_base_descriptor_t *des;
size_t save_size = frag->rdma_length;
int rc;
if (OPAL_LIKELY(NULL == sendreq->src_des)) {
/* setup descriptor */
mca_bml_base_prepare_src( bml_btl,
reg,
&frag->convertor,
MCA_BTL_NO_ORDER,
0,
&frag->rdma_length,
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
MCA_BTL_DES_FLAGS_PUT,
&des );
if( OPAL_UNLIKELY(NULL == des) ) {
if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
frag->rdma_length = save_size;
opal_convertor_set_position(&frag->convertor, &offset);
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
} else {
mca_pml_ob1_send_request_t *sendreq =
(mca_pml_ob1_send_request_t*)frag->rdma_req;
if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
/* Check if the segment is already registered */
for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
/* do not copy the handle to the fragment to avoid deregistering it twice */
local_handle = sendreq->req_rdma[i].btl_reg;
break;
}
}
/* tell receiver to unregister memory */
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
MCA_BTL_NO_ORDER, 1);
if (NULL == frag->local_handle) {
/* Not already registered. Register the region with the BTL. */
mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
&frag->local_handle);
/* send fragment by copy in/out */
mca_pml_ob1_send_request_copy_in_out(sendreq,
frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
/* if a pointer to a receive request is not set it means that
* ACK was not yet received. Don't schedule sends before ACK */
if(NULL != sendreq->req_recv.pval)
mca_pml_ob1_send_request_schedule(sendreq);
if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
return OMPI_ERR_OUT_OF_RESOURCE;
}
return OMPI_ERR_OUT_OF_RESOURCE;
local_handle = frag->local_handle;
}
} else {
/* already have a source descriptor */
des = sendreq->src_des;
sendreq->src_des = NULL;
}
des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
des->des_cbfunc = mca_pml_ob1_put_completion;
des->des_cbdata = frag;
PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
&(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );
rc = mca_bml_base_put(bml_btl, des);
rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
(mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag);
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
mca_bml_base_free(bml_btl, des);
frag->rdma_length = save_size;
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else {
/* TSW - FIX */
OMPI_ERROR_LOG(rc);
ompi_rte_abort(-1, NULL);
}
mca_pml_ob1_send_request_put_frag_failed (frag, rc);
return rc;
}
return OMPI_SUCCESS;
@ -1240,12 +1189,11 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
*/
void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
mca_btl_base_module_t* btl,
mca_btl_base_module_t* btl,
mca_pml_ob1_rdma_hdr_t* hdr )
{
mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
mca_pml_ob1_rdma_frag_t* frag;
size_t i, size = 0;
if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
@ -1253,61 +1201,36 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
sendreq->req_recv.pval = hdr->hdr_recv_req.pval;
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if (NULL == sendreq->rdma_frag) {
MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
if( OPAL_UNLIKELY(NULL == frag) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
ompi_rte_abort(-1, NULL);
}
assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
/* setup fragment */
memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size);
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
(ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
size += opal_swap_bytes4(seg->seg_len);
} else
#endif
{
size += seg->seg_len;
if( OPAL_UNLIKELY(NULL == frag) ) {
/* TSW - FIX */
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
ompi_rte_abort(-1, NULL);
}
} else {
/* rget fallback on put */
frag = sendreq->rdma_frag;
sendreq->rdma_frag = NULL;
sendreq->req_state = 0;
}
/* copy registration data */
memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
frag->rdma_hdr.hdr_rdma = *hdr;
frag->rdma_req = sendreq;
frag->rdma_ep = bml_endpoint;
frag->rdma_length = size;
frag->rdma_length = hdr->hdr_dst_size;
frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
frag->reg = NULL;
frag->remote_address = hdr->hdr_dst_ptr;
frag->retries = 0;
if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
/* get fallback path */
sendreq->req_state = 0;
}
/* lookup the corresponding registration */
for(i=0; i<sendreq->req_rdma_cnt; i++) {
if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
frag->reg = sendreq->req_rdma[i].btl_reg;
break;
}
}
/* RDMA writes may proceed in parallel to send and to each other, so
* create clone of the convertor for each RDMA fragment
*/
size = hdr->hdr_rdma_offset;
opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
&frag->convertor, 0, &size);
/* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
* non-contiguous RDMA. If that changes this code will be wrong. */
opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
hdr->hdr_rdma_offset, &frag->local_address);
mca_pml_ob1_send_request_put_frag(frag);
}

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t {
mca_pml_ob1_send_pending_t req_pending;
opal_mutex_t req_send_range_lock;
opal_list_t req_send_ranges;
mca_btl_base_descriptor_t *src_des;
mca_pml_ob1_rdma_frag_t *rdma_frag;
mca_pml_ob1_com_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
@ -124,10 +124,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
ompi_free_list_item_t* item; \
\
if( OPAL_LIKELY(NULL != proc) ) { \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item); \
sendreq = (mca_pml_ob1_send_request_t*)item; \
sendreq->req_send.req_base.req_proc = proc; \
sendreq->src_des = NULL; \
} \
}
@ -163,15 +162,18 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
assert( 0 == _position ); \
}
static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq)
static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq)
{
size_t r;
/* return mpool resources */
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
if( NULL != reg && reg->mpool != NULL ) {
reg->mpool->mpool_deregister(reg->mpool, reg);
struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
if (NULL != handle) {
mca_bml_base_deregister_mem (bml_btl, handle);
sendreq->req_rdma[r].btl_reg = NULL;
}
}
sendreq->req_rdma_cnt = 0;
@ -218,10 +220,14 @@ do {
#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \
do { \
/* Let the base handle the reference counts */ \
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
(ompi_free_list_item_t*)sendreq); \
/* Let the base handle the reference counts */ \
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
if (sendreq->rdma_frag) { \
MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag); \
sendreq->rdma_frag = NULL; \
} \
OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests, \
(ompi_free_list_item_t*)sendreq); \
} while(0)