This commit is contained in:
Nathan Hjelm 2014-10-30 13:54:06 -06:00 committed by Nathan Hjelm
parent 66bd698eaf
commit b75bb8aea7
12 changed files with 644 additions and 677 deletions

View file

@@ -14,7 +14,7 @@
  * Copyright (c) 2006-2008 University of Houston. All rights reserved.
  * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
@@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
     case MCA_PML_OB1_HDR_TYPE_RGET:
         type = "RGET";
         snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
-                  "seg_cnt %d hdr_des %" PRIu64,
+                  "seg_cnt %d frag %" PRIu64 " src_ptr %" PRIu64,
                   hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
                   hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
-                  hdr->hdr_rndv.hdr_msg_length,
-                  hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval);
+                  hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
+                  hdr->hdr_rget.hdr_src_ptr);
         break;
     case MCA_PML_OB1_HDR_TYPE_ACK:
         type = "ACK";
-        snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64,
+        snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
                   hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
-                  hdr->hdr_ack.hdr_send_offset);
+                  hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
         break;
     case MCA_PML_OB1_HDR_TYPE_FRAG:
         type = "FRAG";
@@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
         break;
     case MCA_PML_OB1_HDR_TYPE_PUT:
         type = "PUT";
-        snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]",
-                  hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval,
+        snprintf( header, 128, "seg_cnt %d dst_req %p src_frag %p recv_req %p offset %" PRIu64
+                  " dst_ptr %" PRIu64 " dst_size %" PRIu64,
+                  hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
                   hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
-                  hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len);
+                  hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
         break;
     case MCA_PML_OB1_HDR_TYPE_FIN:
         type = "FIN";
@@ -638,7 +639,8 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
  */
 int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                           mca_bml_base_btl_t* bml_btl,
-                          opal_ptr_t hdr_des,
+                          opal_ptr_t hdr_frag,
+                          uint64_t rdma_size,
                           uint8_t order,
                           uint32_t status )
 {
@@ -650,18 +652,15 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
     if(NULL == fin) {
-        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
+        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
     fin->des_cbfunc = mca_pml_ob1_fin_completion;
     fin->des_cbdata = NULL;

     /* fill in header */
-    hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
-    hdr->hdr_des = hdr_des;
-    hdr->hdr_fail = status;
+    mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_local->seg_addr.pval,
+                                 0, hdr_frag.lval, status ? status : rdma_size);

     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);
@@ -676,7 +675,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
         return OMPI_SUCCESS;
     }
     mca_bml_base_free(bml_btl, fin);
-    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
+    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
     return OMPI_ERR_OUT_OF_RESOURCE;
 }
@@ -717,6 +716,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
                                              pckt->hdr.hdr_ack.hdr_src_req.lval,
                                              pckt->hdr.hdr_ack.hdr_dst_req.pval,
                                              pckt->hdr.hdr_ack.hdr_send_offset,
+                                             pckt->hdr.hdr_ack.hdr_send_size,
                                              pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
             if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@@ -728,9 +728,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
             break;
         case MCA_PML_OB1_HDR_TYPE_FIN:
             rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
-                                      pckt->hdr.hdr_fin.hdr_des,
+                                      pckt->hdr.hdr_fin.hdr_frag,
+                                      pckt->hdr.hdr_fin.hdr_size,
                                       pckt->order,
-                                      pckt->hdr.hdr_fin.hdr_fail);
+                                      pckt->status);
             if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                 return;
             }
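The FIN control message now folds the completion size and the error status into one signed field: mca_pml_ob1_send_fin() passes `status ? status : rdma_size`, so a positive hdr_size reports bytes completed and a negative value carries an error code. A minimal standalone sketch of that convention follows; the names are illustrative, not the real ob1 types, and the status parameter is narrowed to a signed int32_t here so error codes keep their sign:

    #include <assert.h>
    #include <stdint.h>

    /* Stand-in for mca_pml_ob1_fin_hdr_t.hdr_size: positive = bytes completed,
     * negative = error code, mirroring the status ? status : rdma_size call site. */
    static int64_t fin_hdr_size (int32_t status, uint64_t rdma_size)
    {
        return status ? (int64_t) status : (int64_t) rdma_size;
    }

    int main (void)
    {
        assert (fin_hdr_size (0, 4096) == 4096);   /* successful 4 KiB transfer */
        assert (fin_hdr_size (-1, 4096) == -1);    /* failed transfer, error code */
        return 0;
    }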

View file

@@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t {
     mca_pml_ob1_hdr_t hdr;
     struct mca_bml_base_btl_t *bml_btl;
     uint8_t order;
+    int status;
 };
 typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
 OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
@@ -234,17 +235,17 @@ do { \
                               (ompi_free_list_item_t*)pckt); \
 } while(0)

-#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \
+#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \
     do { \
         mca_pml_ob1_pckt_pending_t *_pckt; \
         \
         MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \
-        _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
-        _pckt->hdr.hdr_fin.hdr_des = (D); \
-        _pckt->hdr.hdr_fin.hdr_fail = (S); \
+        mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \
+                                     (D).lval, (Sz)); \
         _pckt->proc = (P); \
         _pckt->bml_btl = (B); \
         _pckt->order = (O); \
+        _pckt->status = (S); \
         OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
         opal_list_append(&mca_pml_ob1.pckt_pending, \
                          (opal_list_item_t*)_pckt); \
@@ -253,7 +254,7 @@ do { \
 int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
-                         opal_ptr_t hdr_des, uint8_t order, uint32_t status);
+                         opal_ptr_t hdr_frag, uint64_t size, uint8_t order, uint32_t status);

 /* This function tries to resend FIN/ACK packets from pckt_pending queue.
  * Packets are added to the queue when sending of FIN or ACK is failed due to
@@ -338,7 +339,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
 /* represent BTL chosen for sending request */
 struct mca_pml_ob1_com_btl_t {
     mca_bml_base_btl_t *bml_btl;
-    struct mca_mpool_base_registration_t* btl_reg;
+    struct mca_btl_base_registration_handle_t *btl_reg;
     size_t length;
 };
 typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;

View file

@@ -11,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 2009 IBM Corporation. All rights reserved.
- * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
  * reserved.
  * $COPYRIGHT$
  *
@@ -63,6 +63,13 @@ struct mca_pml_ob1_common_hdr_t {
 };
 typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;

+static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
+                                                   uint8_t hdr_flags)
+{
+    hdr->hdr_type = hdr_type;
+    hdr->hdr_flags = hdr_flags;
+}
+
 #define MCA_PML_OB1_COMMON_HDR_NTOH(h)
 #define MCA_PML_OB1_COMMON_HDR_HTON(h)
@@ -88,15 +95,19 @@ struct mca_pml_ob1_match_hdr_t {
 typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;

+static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
+                                                  uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
+    hdr->hdr_ctx = hdr_ctx;
+    hdr->hdr_src = hdr_src;
+    hdr->hdr_tag = hdr_tag;
+    hdr->hdr_seq = hdr_seq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_MATCH_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_MATCH_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+}

 #define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
 do { \
@@ -110,7 +121,6 @@ do { \
 #define MCA_PML_OB1_MATCH_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_MATCH_HDR_FILL(h); \
     (h).hdr_ctx = htons((h).hdr_ctx); \
     (h).hdr_src = htonl((h).hdr_src); \
     (h).hdr_tag = htonl((h).hdr_tag); \
@@ -129,12 +139,14 @@ struct mca_pml_ob1_rendezvous_hdr_t {
 };
 typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;

-#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RNDV_HDR_FILL(h) \
-    MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match)
-#else
-#define MCA_PML_OB1_RNDV_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
+                                                       uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
+                                                       uint64_t hdr_msg_length, void *hdr_src_req)
+{
+    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);
+    hdr->hdr_msg_length = hdr_msg_length;
+    hdr->hdr_src_req.pval = hdr_src_req;
+}

 /* Note that hdr_src_req is not put in network byte order because it
    is never processed by the receiver, other than being copied into
@@ -148,7 +160,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
 #define MCA_PML_OB1_RNDV_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
-    MCA_PML_OB1_RNDV_HDR_FILL(h); \
     (h).hdr_msg_length = hton64((h).hdr_msg_length); \
 } while (0)
@@ -157,38 +168,47 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
  */
 struct mca_pml_ob1_rget_hdr_t {
     mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
-    uint32_t hdr_seg_cnt; /**< number of segments for rdma */
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[4];
 #endif
-    opal_ptr_t hdr_des; /**< source descriptor */
+    opal_ptr_t hdr_frag; /**< source fragment (for fin) */
+    uint64_t hdr_src_ptr; /**< source pointer */
+    /* btl registration handle data follows */
 };
 typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;

+static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
+                                                 uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
+                                                 void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
+{
+    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
+                                        hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RGET_HDR_FILL(h) \
-do { \
-    MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-    (h).hdr_padding[2] = 0; \
-    (h).hdr_padding[3] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_RGET_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+#endif
+    hdr->hdr_frag.pval = hdr_frag;
+    hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;
+
+    /* copy registration handle */
+    memcpy (hdr + 1, local_handle, local_handle_size);
+}

 #define MCA_PML_OB1_RGET_HDR_NTOH(h) \
 do { \
     MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
     (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
+    (h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \
 } while (0)

 #define MCA_PML_OB1_RGET_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
-    MCA_PML_OB1_RGET_HDR_FILL(h); \
     (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
+    (h).hdr_src_ptr = hton64((h).hdr_src_ptr); \
 } while (0)

 /**
@@ -205,19 +225,23 @@ struct mca_pml_ob1_frag_hdr_t {
 };
 typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;

+static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint64_t hdr_frag_offset, void *hdr_src_req,
+                                                 uint64_t hdr_dst_req)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_FRAG_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-    (h).hdr_padding[2] = 0; \
-    (h).hdr_padding[3] = 0; \
-    (h).hdr_padding[4] = 0; \
-    (h).hdr_padding[5] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_FRAG_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+    hdr->hdr_padding[4] = 0;
+    hdr->hdr_padding[5] = 0;
+#endif
+    hdr->hdr_frag_offset = hdr_frag_offset;
+    hdr->hdr_src_req.pval = hdr_src_req;
+    hdr->hdr_dst_req.lval = hdr_dst_req;
+}

 #define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
 do { \
@@ -228,7 +252,6 @@ do { \
 #define MCA_PML_OB1_FRAG_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_FRAG_HDR_FILL(h); \
     (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
 } while (0)
@@ -244,22 +267,28 @@ struct mca_pml_ob1_ack_hdr_t {
     opal_ptr_t hdr_src_req; /**< source request */
     opal_ptr_t hdr_dst_req; /**< matched receive request */
     uint64_t hdr_send_offset; /**< starting point of copy in/out */
+    uint64_t hdr_send_size; /**< number of bytes requested (0: all remaining) */
 };
 typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;

+static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
+                                                uint64_t hdr_src_req, void *hdr_dst_req,
+                                                uint64_t hdr_send_offset, uint64_t hdr_send_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_ACK_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-    (h).hdr_padding[2] = 0; \
-    (h).hdr_padding[3] = 0; \
-    (h).hdr_padding[4] = 0; \
-    (h).hdr_padding[5] = 0; \
-} while (0)
-#else
-#define MCA_PML_OB1_ACK_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+    hdr->hdr_padding[4] = 0;
+    hdr->hdr_padding[5] = 0;
+#endif
+    hdr->hdr_src_req.lval = hdr_src_req;
+    hdr->hdr_dst_req.pval = hdr_dst_req;
+    hdr->hdr_send_offset = hdr_send_offset;
+    hdr->hdr_send_size = hdr_send_size;
+}

 /* Note that the request headers are not put in NBO because the
    src_req is already in receiver's byte order and the dst_req is not
@@ -269,13 +298,14 @@ do { \
 do { \
     MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
     (h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
+    (h).hdr_send_size = ntoh64((h).hdr_send_size); \
 } while (0)

 #define MCA_PML_OB1_ACK_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_ACK_HDR_FILL(h); \
     (h).hdr_send_offset = hton64((h).hdr_send_offset); \
+    (h).hdr_send_size = hton64((h).hdr_send_size); \
 } while (0)

 /**
@@ -287,38 +317,55 @@ struct mca_pml_ob1_rdma_hdr_t {
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[2]; /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
 #endif
-    uint32_t hdr_seg_cnt; /**< number of segments for rdma */
+    /* TODO: add real support for multiple destination segments */
     opal_ptr_t hdr_req; /**< destination request */
-    opal_ptr_t hdr_des; /**< source descriptor */
+    opal_ptr_t hdr_frag; /**< receiver fragment */
     opal_ptr_t hdr_recv_req; /**< receive request (NTH: needed for put fallback on send) */
     uint64_t hdr_rdma_offset; /**< current offset into user buffer */
-    mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
+    uint64_t hdr_dst_ptr; /**< destination address */
+    uint64_t hdr_dst_size; /**< destination size */
+    /* registration data follows */
 };
 typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;

+static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
+                                                 uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
+                                                 uint64_t hdr_dst_size, void *local_handle,
+                                                 size_t local_handle_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RDMA_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_RDMA_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+    hdr->hdr_req.lval = hdr_req;
+    hdr->hdr_frag.pval = hdr_frag;
+    hdr->hdr_recv_req.pval = hdr_recv_req;
+    hdr->hdr_rdma_offset = hdr_rdma_offset;
+    hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
+    hdr->hdr_dst_size = hdr_dst_size;
+
+    /* copy segments */
+    memcpy (hdr + 1, local_handle, local_handle_size);
+}

 #define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
     (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
     (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
+    (h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \
+    (h).hdr_dst_size = ntoh64((h).hdr_dst_size); \
 } while (0)

 #define MCA_PML_OB1_RDMA_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_RDMA_HDR_FILL(h); \
     (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
     (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
+    (h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \
+    (h).hdr_dst_size = hton64((h).hdr_dst_size); \
 } while (0)

 /**
@@ -330,30 +377,33 @@ struct mca_pml_ob1_fin_hdr_t {
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[2];
 #endif
-    uint32_t hdr_fail; /**< RDMA operation failed */
-    opal_ptr_t hdr_des; /**< completed descriptor */
+    int64_t hdr_size; /**< number of bytes completed (positive), error code (negative) */
+    opal_ptr_t hdr_frag; /**< completed RDMA fragment */
 };
 typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;

+static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
+                                                uint64_t hdr_frag, int64_t hdr_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_FIN_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-} while (0)
-#else
-#define MCA_PML_OB1_FIN_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+    hdr->hdr_frag.lval = hdr_frag;
+    hdr->hdr_size = hdr_size;
+}

 #define MCA_PML_OB1_FIN_HDR_NTOH(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
+    (h).hdr_size = ntoh64((h).hdr_size); \
 } while (0)

 #define MCA_PML_OB1_FIN_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_FIN_HDR_FILL(h); \
+    (h).hdr_size = hton64((h).hdr_size); \
 } while (0)

 /**
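Throughout this header the commit replaces the conditional *_HDR_FILL padding macros, which were invoked from the byte-swap macros, with inline prepare functions that initialize every field (padding included) exactly once at construction time. A toy illustration of the same pattern, using made-up names rather than the real ob1 headers:

    #include <arpa/inet.h>  /* htons */
    #include <stdint.h>
    #include <stdio.h>

    /* Toy header in the style of mca_pml_ob1_common_hdr_t. */
    struct toy_hdr {
        uint8_t  hdr_type;
        uint8_t  hdr_flags;
        uint16_t hdr_ctx;
    };

    /* One prepare function sets every field at initialization time,
     * so byte swapping no longer has to double as initialization. */
    static inline void toy_hdr_prepare (struct toy_hdr *hdr, uint8_t type,
                                        uint8_t flags, uint16_t ctx)
    {
        hdr->hdr_type  = type;
        hdr->hdr_flags = flags;
        hdr->hdr_ctx   = ctx;
    }

    /* Byte swapping is now purely a presentation step. */
    #define TOY_HDR_HTON(h) do { (h).hdr_ctx = htons((h).hdr_ctx); } while (0)

    int main (void)
    {
        struct toy_hdr hdr;
        toy_hdr_prepare (&hdr, 1, 0, 42);
        TOY_HDR_HTON(hdr);
        printf ("type %u ctx (network order) 0x%04x\n", hdr.hdr_type, hdr.hdr_ctx);
        return 0;
    }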

View file

@@ -94,12 +94,9 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
         opal_convertor_get_packed_size (&convertor, &size);
     }

-    match.hdr_common.hdr_flags = 0;
-    match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-    match.hdr_ctx = comm->c_contextid;
-    match.hdr_src = comm->c_my_rank;
-    match.hdr_tag = tag;
-    match.hdr_seq = seqn;
+    mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                   comm->c_contextid, comm->c_my_rank,
+                                   tag, seqn);

     ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);
@@ -220,7 +217,7 @@ int mca_pml_ob1_send(void *buf,
     OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
     sendreq->req_send.req_base.req_proc = dst_proc;
-    sendreq->src_des = NULL;
+    sendreq->rdma_frag = NULL;

     MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                   buf,

View file

@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  * University Research and Technology
@@ -9,6 +10,8 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -27,11 +30,6 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdma.h"

-/* Use this registration if no registration needed for a BTL instead of NULL.
- * This will help other code to distinguish case when memory is not registered
- * from case when registration is not needed */
-static mca_mpool_base_registration_t pml_ob1_dummy_reg;
-
 /*
  * Check to see if memory is registered or can be registered. Build a
  * set of registrations on the request.
@@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls(
 {
     int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
     double weight_total = 0;
-    int num_btls_used = 0, n;
+    int num_btls_used = 0;

     /* shortcut when there are no rdma capable btls */
     if(num_btls == 0) {
@@ -53,29 +51,25 @@ size_t mca_pml_ob1_rdma_btls(
     }

     /* check to see if memory is registered */
-    for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
-        n++) {
+    for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
         mca_bml_base_btl_t* bml_btl =
             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                                              (bml_endpoint->btl_rdma_index + n) % num_btls);
-        mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg;
-        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
+        mca_btl_base_registration_handle_t *reg_handle = NULL;
+        mca_btl_base_module_t *btl = bml_btl->btl;

-        if( NULL != btl_mpool ) {
-            if(!mca_pml_ob1.leave_pinned) {
-                /* look through existing registrations */
-                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
-            } else {
-                /* register the memory */
-                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
-            }
-
-            if(NULL == reg)
-                continue;
-        }
+        if (btl->btl_register_mem) {
+            /* try to register the memory with the btl */
+            reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
+                                                size, MCA_BTL_REG_FLAG_REMOTE_READ);
+            if (NULL == reg_handle) {
+                /* btl requires registration but the registration failed */
+                continue;
+            }
+        } /* else no registration is needed */

         rdma_btls[num_btls_used].bml_btl = bml_btl;
-        rdma_btls[num_btls_used].btl_reg = reg;
+        rdma_btls[num_btls_used].btl_reg = reg_handle;
         weight_total += bml_btl->btl_weight;
         num_btls_used++;
     }
@@ -103,10 +97,6 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
     for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
         rdma_btls[i].bml_btl =
             mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
-        if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
-            rdma_btls[i].btl_reg = NULL;
-        else
-            rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
         weight_total += rdma_btls[i].bml_btl->btl_weight;
     }
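With the dummy registration object gone, "no registration needed" is now signaled by the btl itself through a NULL btl_register_mem function pointer. A hypothetical sketch of the resulting control flow, with toy types standing in for the real btl/bml interfaces:

    #include <stddef.h>

    typedef struct toy_reg_handle toy_reg_handle_t;

    typedef struct toy_btl {
        /* NULL when the transport can RDMA into unregistered memory */
        toy_reg_handle_t *(*btl_register_mem) (struct toy_btl *btl, void *base,
                                               size_t size, int flags);
    } toy_btl_t;

    /* Returns 0 if the btl is usable for this region; *handle stays NULL
     * when no registration is required, mirroring mca_pml_ob1_rdma_btls(). */
    static int toy_rdma_try_btl (toy_btl_t *btl, void *base, size_t size,
                                 toy_reg_handle_t **handle)
    {
        *handle = NULL;
        if (btl->btl_register_mem) {
            *handle = btl->btl_register_mem (btl, base, size, /* REMOTE_READ */ 1);
            if (NULL == *handle) {
                return -1;  /* registration required but failed: skip this btl */
            }
        } /* else no registration is needed */
        return 0;
    }

    int main (void)
    {
        toy_btl_t copy_btl = { .btl_register_mem = NULL };  /* e.g. shared memory */
        toy_reg_handle_t *handle;
        return toy_rdma_try_btl (&copy_btl, (void *) 0, 0, &handle);
    }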

View file

@@ -32,18 +32,29 @@ typedef enum {
     MCA_PML_OB1_RDMA_GET
 } mca_pml_ob1_rdma_state_t;

+struct mca_pml_ob1_rdma_frag_t;
+
+typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
+
+/**
+ * Used to keep track of local and remote RDMA operations.
+ */
 struct mca_pml_ob1_rdma_frag_t {
     ompi_free_list_item_t super;
     mca_bml_base_btl_t *rdma_bml;
     mca_pml_ob1_hdr_t rdma_hdr;
     mca_pml_ob1_rdma_state_t rdma_state;
     size_t rdma_length;
-    uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
     void *rdma_req;
-    struct mca_bml_base_endpoint_t* rdma_ep;
-    opal_convertor_t convertor;
-    mca_mpool_base_registration_t* reg;
     uint32_t retries;
+    mca_pml_ob1_rdma_frag_callback_t cbfunc;
+    uint64_t rdma_offset;
+    void *local_address;
+    mca_btl_base_registration_handle_t *local_handle;
+    uint64_t remote_address;
+    uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
 };
 typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;
@@ -60,11 +71,14 @@ do { \
 #define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
 do { \
     /* return fragment */ \
+    if (frag->local_handle) { \
+        mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \
+        frag->local_handle = NULL; \
+    } \
     OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
                              (ompi_free_list_item_t*)frag); \
 } while (0)

 END_C_DECLS

 #endif
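Because the FIN header now carries the fragment pointer and a signed size, the FIN receiver can dispatch straight through the fragment's cbfunc instead of a btl descriptor callback. A self-contained sketch of that dispatch, using a toy analogue of mca_pml_ob1_rdma_frag_t:

    #include <stdint.h>
    #include <stdio.h>

    struct toy_frag;
    typedef void (*toy_frag_cb_t)(struct toy_frag *frag, int64_t rdma_length);

    /* The fragment records the completion callback to run when the
     * matching FIN arrives. */
    typedef struct toy_frag {
        toy_frag_cb_t cbfunc;
        uint64_t      rdma_offset;
        uint64_t      rdma_length;
    } toy_frag_t;

    static void put_done (toy_frag_t *frag, int64_t rdma_length)
    {
        if (rdma_length < 0) {
            printf ("rdma failed: %lld\n", (long long) rdma_length);
            return;
        }
        printf ("put of %llu bytes at offset %llu complete\n",
                (unsigned long long) rdma_length,
                (unsigned long long) frag->rdma_offset);
    }

    int main (void)
    {
        toy_frag_t frag = { .cbfunc = put_done, .rdma_offset = 0, .rdma_length = 4096 };
        /* what the FIN handler does: frag->cbfunc (frag, hdr->hdr_size) */
        frag.cbfunc (&frag, (int64_t) frag.rdma_length);
        return 0;
    }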

View file

@@ -295,6 +295,7 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
     mca_btl_base_segment_t* segments = des->des_local;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     mca_pml_ob1_send_request_t* sendreq;
+    size_t size;

     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
         return;
@@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
     /* if the request should be delivered entirely by copy in/out
      * then throttle sends */
     if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
-        if (NULL != sendreq->src_des) {
-            /* release registered memory */
-            mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
-            sendreq->src_des = NULL;
+        if (NULL != sendreq->rdma_frag) {
+            if (NULL != sendreq->rdma_frag->local_handle) {
+                mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
+                sendreq->rdma_frag->local_handle = NULL;
+            }
+            MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
+            sendreq->rdma_frag = NULL;
         }

         sendreq->req_throttle_sends = true;
     }

-    mca_pml_ob1_send_request_copy_in_out(sendreq,
-                                         hdr->hdr_ack.hdr_send_offset,
-                                         sendreq->req_send.req_bytes_packed -
-                                         hdr->hdr_ack.hdr_send_offset);
+    if (hdr->hdr_ack.hdr_send_size) {
+        size = hdr->hdr_ack.hdr_send_size;
+    } else {
+        size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
+    }
+
+    mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);

     if (sendreq->req_state != 0) {
         /* Typical receipt of an ACK message causes req_state to be
@@ -362,6 +369,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
         return;
     }
+
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
     recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
@@ -380,6 +388,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
         return;
     }
 #endif /* OPAL_CUDA_SUPPORT */
+
     mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);

     return;
@@ -411,19 +420,16 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
     mca_btl_base_segment_t* segments = des->des_local;
-    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
-    mca_btl_base_descriptor_t* rdma;
+    mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
+    mca_pml_ob1_rdma_frag_t *frag;

-    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
+    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
         return;
     }

     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
-    rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
-    rdma->des_cbfunc(btl, NULL, rdma,
-                     hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
-
-    return;
 }
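The new hdr_send_size field lets an ACK request a specific byte range (used by the rget-to-send fallback), while zero keeps the old meaning of "everything past hdr_send_offset". The selection logic above, restated as a tiny checked example with a hypothetical helper name:

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors the ACK handling in mca_pml_ob1_recv_frag_callback_ack():
     * hdr_send_size == 0 means "send all remaining bytes". */
    static uint64_t ack_bytes_to_send (uint64_t bytes_packed, uint64_t send_offset,
                                       uint64_t send_size)
    {
        return send_size ? send_size : bytes_packed - send_offset;
    }

    int main (void)
    {
        assert (ack_bytes_to_send (1 << 20, 4096, 0) == (1 << 20) - 4096);
        assert (ack_bytes_to_send (1 << 20, 4096, 8192) == 8192);
        return 0;
    }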

View file

@@ -13,7 +13,7 @@
  * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
  * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2012 FUJITSU LIMITED. All rights reserved.
  * Copyright (c) 2014 Research Organization for Information Science
@@ -183,31 +183,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl,
  * Put operation has completed remotely - update request status
  */
-static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
-                                        struct mca_btl_base_endpoint_t* ep,
-                                        struct mca_btl_base_descriptor_t* des,
-                                        int status )
+static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size)
 {
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
-    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata;
-    size_t bytes_received = 0;
-
-    if( OPAL_LIKELY(status == OMPI_SUCCESS) ) {
-        bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
-                                                             (void *) des->des_local,
-                                                             des->des_local_count, 0);
-    }
+    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
+
+    assert (rdma_size == frag->rdma_length);

     OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
-    mca_bml_base_free(bml_btl, des);
+    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);

-    /* check completion status */
-    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
-    if (recv_request_pml_complete_check(recvreq) == false &&
-        recvreq->req_rdma_offset < recvreq->req_send_offset) {
-        /* schedule additional rdma operations */
-        mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
+    if (OPAL_LIKELY(0 < rdma_size)) {
+        /* check completion status */
+        OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
+        if (recv_request_pml_complete_check(recvreq) == false &&
+            recvreq->req_rdma_offset < recvreq->req_send_offset) {
+            /* schedule additional rdma operations */
+            mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
+        }
     }

     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
@@ -218,7 +214,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
 int mca_pml_ob1_recv_request_ack_send_btl(
     ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
     uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
-    bool nordma)
+    size_t size, bool nordma)
 {
     mca_btl_base_descriptor_t* des;
     mca_pml_ob1_ack_hdr_t* ack;
@@ -234,11 +230,8 @@ int mca_pml_ob1_recv_request_ack_send_btl(
     /* fill out header */
     ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval;
-    ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;
-    ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0;
-    ack->hdr_src_req.lval = hdr_src_req;
-    ack->hdr_dst_req.pval = hdr_dst_req;
-    ack->hdr_send_offset = hdr_send_offset;
+    mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0,
+                                 hdr_src_req, hdr_dst_req, hdr_send_offset, size);

     ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc);
@@ -312,63 +305,99 @@ static int mca_pml_ob1_recv_request_ack(
         if(recvreq->req_send_offset == hdr->hdr_msg_length)
             return OMPI_SUCCESS;
     }

     /* let know to shedule function there is no need to put ACK flag */
     recvreq->req_ack_sent = true;
     return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
-                                             recvreq, recvreq->req_send_offset,
+                                             recvreq, recvreq->req_send_offset, 0,
                                              recvreq->req_send_offset == bytes_received);
 }

+static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag);
+
+static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
+{
+    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
+    ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
+
+    if (OMPI_ERR_NOT_AVAILABLE == rc) {
+        /* get isn't supported for this transfer. tell peer to fallback on put */
+        rc = mca_pml_ob1_recv_request_put_frag (frag);
+        if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
+            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+            opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+
+            return OMPI_SUCCESS;
+        }
+    }
+
+    if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
+        OMPI_ERR_OUT_OF_RESOURCE == rc) {
+        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
+        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+        return OMPI_SUCCESS;
+    }
+
+    /* tell peer to fall back on send for this region */
+    rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
+                                           recvreq, frag->rdma_offset, frag->rdma_length, false);
+    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+    return rc;
+}
+
 /**
  * Return resources used by the RDMA
  */
-static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
-                                         struct mca_btl_base_endpoint_t* ep,
-                                         struct mca_btl_base_descriptor_t* des,
-                                         int status )
+static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
+                                         void *local_address, mca_btl_base_registration_handle_t *local_handle,
+                                         void *context, void *cbdata, int status)
 {
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
-    mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
+    mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
+    mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
     mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;

     /* check completion status */
     if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
-        /* TSW - FIX */
-        OMPI_ERROR_LOG(status);
-        ompi_rte_abort(-1, NULL);
-    }
-
-    /* is receive request complete */
-    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
-    if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) {
-        mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
-                              bml_btl,
-                              frag->rdma_hdr.hdr_rget.hdr_des,
-                              des->order, 0);
-    }
-
-    recv_request_pml_complete_check(recvreq);
-
-    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+        status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
+            /* TSW - FIX */
+            OMPI_ERROR_LOG(status);
+            ompi_rte_abort(-1, NULL);
+        }
+    } else {
+        /* is receive request complete */
+        OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
+        /* TODO: re-add order */
+        mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
+                              bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
+                              frag->rdma_length, 0, 0);
+
+        recv_request_pml_complete_check(recvreq);
+
+        MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+    }

     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }

-static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
-                                          mca_btl_base_descriptor_t *dst) {
+static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
+{
     mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
     mca_btl_base_descriptor_t *ctl;
     mca_pml_ob1_rdma_hdr_t *hdr;
-    size_t seg_size;
+    size_t reg_size;
     int rc;

-    seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count;
+    reg_size = bml_btl->btl->btl_registration_handle_size;

     /* prepare a descriptor for rdma control message */
-    mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size,
+    mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size,
                         MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
                         MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
     if (OPAL_UNLIKELY(NULL == ctl)) {
@@ -378,26 +407,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
     /* fill in rdma header */
     hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
-    hdr->hdr_common.hdr_flags =
-        (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
-
-    hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
-    hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
-    hdr->hdr_des.pval = dst;
-    hdr->hdr_recv_req.pval = recvreq;
-
-    hdr->hdr_seg_cnt = dst->des_local_count;
-
-    /* copy segments */
-    memcpy (hdr + 1, dst->des_local, seg_size);
-
-    dst->des_cbfunc = mca_pml_ob1_put_completion;
-    dst->des_cbdata = recvreq;
+    mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0,
+                                  recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset,
+                                  frag->local_address, frag->rdma_length, frag->local_handle,
+                                  reg_size);
+    frag->cbfunc = mca_pml_ob1_put_completion;

-    if (!recvreq->req_ack_sent)
     recvreq->req_ack_sent = true;

-    PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
-                                  &(recvreq->req_recv.req_base), size,
-                                  PERUSE_RECV);
-
     /* send rdma request to peer */
     rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
     if (OPAL_UNLIKELY(rc < 0)) {
@@ -413,69 +435,28 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
  */
 int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag)
 {
-    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
-    mca_btl_base_descriptor_t* descriptor;
-    size_t save_size = frag->rdma_length;
     int rc;

     /* prepare descriptor */
-    mca_bml_base_prepare_dst( bml_btl,
-                              NULL,
-                              &recvreq->req_recv.req_base.req_convertor,
-                              MCA_BTL_NO_ORDER,
-                              0,
-                              &frag->rdma_length,
-                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
-                              MCA_BTL_DES_FLAGS_GET,
-                              &descriptor );
-    if( OPAL_UNLIKELY(NULL == descriptor) ) {
-        if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
-            frag->rdma_length = save_size;
-            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-            opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
-            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        } else {
-            ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
-
-            /* tell peer to fall back on send */
-            recvreq->req_send_offset = 0;
-            rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
-                                                   recvreq, recvreq->req_send_offset, true);
-            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
-            return rc;
+    if (bml_btl->btl->btl_register_mem && !frag->local_handle) {
+        mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE |
+                                   MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle);
+        if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
+            return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
         }
     }

-    descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
-    descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
-    descriptor->des_cbfunc = mca_pml_ob1_rget_completion;
-    descriptor->des_cbdata = frag;
-
     PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
-                                 &(recvreq->req_recv.req_base),
+                                 &(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base),
                                  frag->rdma_length, PERUSE_RECV);

     /* queue up get request */
-    rc = mca_bml_base_get(bml_btl,descriptor);
+    rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, frag->local_handle,
+                           (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
+                           0, mca_pml_ob1_rget_completion, frag);
     if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
-        if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
-            /* get isn't supported for this transfer. tell peer to fallback on put */
-            rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
-        }
-
-        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
-            mca_bml_base_free(bml_btl, descriptor);
-            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-            opal_list_append(&mca_pml_ob1.rdma_pending,
-                             (opal_list_item_t*)frag);
-            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-            OMPI_ERROR_LOG(rc);
-            ompi_rte_abort(-1, NULL);
-        }
+        return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
     }

     return OMPI_SUCCESS;
@@ -501,6 +482,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
     bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
                                                               sizeof(mca_pml_ob1_frag_hdr_t));
     data_offset = hdr->hdr_frag.hdr_frag_offset;
+
     /*
      * Make user buffer accessible(defined) before unpacking.
     */
@@ -628,7 +610,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
     mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval;
     mca_bml_base_endpoint_t* bml_endpoint = NULL;
     size_t bytes_remaining, prev_sent, offset;
-    mca_btl_base_segment_t *r_segments;
     mca_pml_ob1_rdma_frag_t *frag;
     mca_bml_base_btl_t *rdma_bml;
     int rc;
@@ -636,6 +617,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
     prev_sent = offset = 0;
     bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
     recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
+    recvreq->req_send_offset = 0;

     MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
@@ -679,8 +661,10 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
         ompi_rte_abort(-1, NULL);
     }

-    bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1),
-                                                                 hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc);
+    bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
+
+    /* save the request for put fallback */
+    recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;

     /* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num
      * of bytes left to be send. In each iteration we send the max possible bytes supported
@@ -689,7 +673,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
      * the next iteration with the updated size.
      * Also - In each iteration we update the location in the buffer to be used for writing
      * the message ,and the location to read from. This is done using the offset variable that
-     * accumulates the number of bytes that were sent so far. */
+     * accumulates the number of bytes that were sent so far.
+     *
+     * NTH: This fragmentation may go away if we change the btls to require them to handle
+     * get fragmentation internally. This is a reasonable solution since some btls do not
+     * need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up
+     * being the case. */
     while (bytes_remaining > 0) {
         /* allocate/initialize a fragment */
         MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
@@ -699,18 +688,15 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
             ompi_rte_abort(-1, NULL);
         }

-        assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
-        memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
+        memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);

-        /* update the read location -- NTH: note this will only work if there is exactly one
-           segment. TODO -- make this work with multiple segments */
-        r_segments = (mca_btl_base_segment_t *) frag->rdma_segs;
-        r_segments->seg_addr.lval += offset;
+        /* update the read location */
+        frag->remote_address = hdr->hdr_src_ptr + offset;

         /* updating the write location */
         OPAL_THREAD_LOCK(&recvreq->lock);
         opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
+        opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address);
         OPAL_THREAD_UNLOCK(&recvreq->lock);

         frag->rdma_bml = rdma_bml;
@@ -718,10 +704,15 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
         frag->rdma_hdr.hdr_rget = *hdr;
         frag->retries = 0;
         frag->rdma_req = recvreq;
-        frag->rdma_ep = bml_endpoint;
         frag->rdma_state = MCA_PML_OB1_RDMA_GET;
-        frag->reg = NULL;
-        frag->rdma_length = bytes_remaining;
+        frag->local_handle = NULL;
+        frag->rdma_offset = offset;
+
+        if (bytes_remaining > rdma_bml->btl->btl_get_limit) {
+            frag->rdma_length = rdma_bml->btl->btl_get_limit;
+        } else {
+            frag->rdma_length = bytes_remaining;
+        }

         /* NTH: TODO -- handle error conditions gracefully */
         rc = mca_pml_ob1_recv_request_get_frag(frag);
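The btl_get_limit chunking introduced above caps each get at the btl's advertised limit. The loop below restates just that arithmetic with illustrative values (a hypothetical 1 MiB per-get limit over a 3 MiB + 123 byte message):

    #include <stdint.h>
    #include <stdio.h>

    int main (void)
    {
        uint64_t btl_get_limit   = 1 << 20;          /* per-get cap */
        uint64_t bytes_remaining = (3 << 20) + 123;  /* message size */
        uint64_t offset          = 0;

        while (bytes_remaining > 0) {
            /* same clamp as the frag->rdma_length assignment above */
            uint64_t len = bytes_remaining > btl_get_limit ? btl_get_limit
                                                           : bytes_remaining;
            printf ("get %llu bytes at offset %llu\n",
                    (unsigned long long) len, (unsigned long long) offset);
            offset += len;
            bytes_remaining -= len;
        }
        return 0;
    }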
@@ -920,13 +911,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
  while(bytes_remaining > 0 &&
        recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
-     size_t size, seg_size;
-     mca_pml_ob1_rdma_hdr_t* hdr;
-     mca_btl_base_descriptor_t* dst;
-     mca_btl_base_descriptor_t* ctl;
-     mca_mpool_base_registration_t * reg = NULL;
+     mca_pml_ob1_rdma_frag_t *frag = NULL;
      mca_btl_base_module_t *btl;
      int rc, rdma_idx;
+     void *data_ptr;
+     size_t size;

      if(prev_bytes_remaining == bytes_remaining) {
          if(++num_fail == num_tries) {
@@ -947,85 +936,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
      do {
          rdma_idx = recvreq->req_rdma_idx;
          bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
-         reg = recvreq->req_rdma[rdma_idx].btl_reg;
          size = recvreq->req_rdma[rdma_idx].length;
          if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
              recvreq->req_rdma_idx = 0;
      } while(!size);
      btl = bml_btl->btl;

-     /* makes sure that we don't exceed BTL max rdma size
-      * if memory is not pinned already */
-     if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) &&
+     /* NTH: This conditional used to check if there was a registration in
+      * recvreq->req_rdma[rdma_idx].btl_reg. If one existed it was due to
+      * the btl not needing registration (equivalent to btl->btl_register_mem
+      * != NULL). This new check is equivalent. Note: I feel this protocol
+      * needs work to better improve resource usage when running with a
+      * leave pinned protocol. */
+     if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
          (size > btl->btl_rdma_pipeline_frag_size)) {
          size = btl->btl_rdma_pipeline_frag_size;
      }

+     MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
+     if (OPAL_UNLIKELY(NULL == frag)) {
+         continue;
+     }
+
-     /* take lock to protect converter against concurrent access
+     /* take lock to protect convertor against concurrent access
       * from unpack */
      OPAL_THREAD_LOCK(&recvreq->lock);
      opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor,
                                   &recvreq->req_rdma_offset);
+     opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
-
-     /* prepare a descriptor for RDMA */
-     mca_bml_base_prepare_dst(bml_btl, reg,
-                              &recvreq->req_recv.req_base.req_convertor,
-                              MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
-                              MCA_BTL_DES_FLAGS_PUT, &dst);
      OPAL_THREAD_UNLOCK(&recvreq->lock);

-     if(OPAL_UNLIKELY(dst == NULL)) {
-         continue;
-     }
-
-     dst->des_cbfunc = mca_pml_ob1_put_completion;
-     dst->des_cbdata = recvreq;
-
-     seg_size = btl->btl_seg_size * dst->des_local_count;
-
-     /* prepare a descriptor for rdma control message */
-     mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size,
-                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
-     if( OPAL_UNLIKELY(NULL == ctl) ) {
-         mca_bml_base_free(bml_btl,dst);
-         continue;
-     }
-     ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
-
-     /* fill in rdma header */
-     hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
-     hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
-     hdr->hdr_common.hdr_flags =
-         (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
-     hdr->hdr_req = recvreq->remote_req_send;
-     hdr->hdr_des.pval = dst;
-     hdr->hdr_recv_req.pval = recvreq;
-     hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
-     hdr->hdr_seg_cnt = dst->des_local_count;
-
-     /* copy segments */
-     memmove (hdr + 1, dst->des_local, seg_size);
-
-     if(!recvreq->req_ack_sent)
-         recvreq->req_ack_sent = true;
-     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc);
-
-     PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
-                                   &(recvreq->req_recv.req_base), size,
-                                   PERUSE_RECV);
-
-     /* send rdma request to peer */
-     rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
-     if( OPAL_LIKELY( rc >= 0 ) ) {
+     if (btl->btl_register_mem) {
+         mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE,
+                                    &frag->local_handle);
+         if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
+             MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+             continue;
+         }
+     }
+
+     /* fill in the minimum information needed to handle the fin message */
+     frag->cbfunc = mca_pml_ob1_put_completion;
+     frag->rdma_length = size;
+     frag->rdma_req = recvreq;
+     frag->rdma_bml = bml_btl;
+     frag->local_address = data_ptr;
+     frag->rdma_offset = recvreq->req_rdma_offset;
+
+     rc = mca_pml_ob1_recv_request_put_frag (frag);
+     if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
          /* update request state */
          recvreq->req_rdma_offset += size;
          OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
          recvreq->req_rdma[rdma_idx].length -= size;
          bytes_remaining -= size;
      } else {
-         mca_bml_base_free(bml_btl,ctl);
-         mca_bml_base_free(bml_btl,dst);
+         MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
      }
  }
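With the descriptor-based path gone, the receiver registers its own landing zone (when the btl needs registration) and then hands the btl a bare fragment. A toy model of that register-then-put flow; the helper functions and types below are simplified stand-ins, not the ob1 API:

    #include <stdio.h>

    typedef struct { void *local_handle; void *local_address; size_t rdma_length; } toy_frag_t;

    /* stand-in for mca_bml_base_register_mem: pretend the handle is the address */
    static void *toy_register_mem (void *addr, size_t size) {
        (void) size;
        return addr;
    }

    /* stand-in for mca_pml_ob1_recv_request_put_frag */
    static int toy_put_frag (toy_frag_t *frag) {
        printf ("put frag: addr=%p len=%zu\n", frag->local_address, frag->rdma_length);
        return 0;
    }

    int main (void) {
        char buffer[4096];
        int btl_register_mem = 1; /* this btl requires registration */
        toy_frag_t frag = { NULL, buffer, sizeof (buffer) };

        if (btl_register_mem) {
            frag.local_handle = toy_register_mem (frag.local_address, frag.rdma_length);
            if (NULL == frag.local_handle) {
                return 1; /* real code returns the frag to its free list and retries */
            }
        }
        return toy_put_frag (&frag);
    }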


@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -10,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2014      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
@@ -154,9 +155,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
     }

     for(i = 0; i < recvreq->req_rdma_cnt; i++) {
-        mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
-        if( NULL != btl_reg && btl_reg->mpool != NULL) {
-            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
+        struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg;
+        mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl;
+
+        if (NULL != handle) {
+            mca_bml_base_deregister_mem (bml_btl, handle);
         }
     }
     recvreq->req_rdma_cnt = 0;
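Registration handles are now released through mca_bml_base_deregister_mem on the same bml_btl that produced them, instead of going through the handle's mpool. A toy illustration of that pairing discipline (the types and helpers are invented stand-ins):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int id; } toy_btl_t;
    typedef struct { toy_btl_t *owner; } toy_handle_t;

    /* register: the handle remembers which transport created it */
    static toy_handle_t *toy_register (toy_btl_t *btl) {
        toy_handle_t *h = malloc (sizeof (*h));
        if (h) h->owner = btl;
        return h;
    }

    /* deregister: must be called with the same transport */
    static void toy_deregister (toy_btl_t *btl, toy_handle_t *h) {
        if (NULL != h && h->owner == btl) {
            free (h);
        }
    }

    int main (void) {
        toy_btl_t btl = { 0 };
        toy_handle_t *handle = toy_register (&btl);
        /* ... rdma traffic ... */
        toy_deregister (&btl, handle); /* same pairing as the cleanup loop above */
        printf ("handle released\n");
        return 0;
    }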
@@ -387,7 +390,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
     (void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
 }

-#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O)                      \
+#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz)                  \
     do {                                                                \
         mca_pml_ob1_pckt_pending_t *_pckt;                              \
                                                                         \
@@ -396,6 +399,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
         _pckt->hdr.hdr_ack.hdr_src_req.lval = (S);                      \
         _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D);                      \
         _pckt->hdr.hdr_ack.hdr_send_offset = (O);                       \
+        _pckt->hdr.hdr_ack.hdr_send_size = (Sz);                        \
         _pckt->proc = (P);                                              \
         _pckt->bml_btl = NULL;                                          \
         OPAL_THREAD_LOCK(&mca_pml_ob1.lock);                            \
@@ -406,11 +410,11 @@ static inline void mca_pml_ob1_recv_request_schedule(
 int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
         mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
-        uint64_t hdr_rdma_offset, bool nordma);
+        uint64_t hdr_rdma_offset, uint64_t size, bool nordma);

 static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
         uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
-        bool nordma)
+        uint64_t size, bool nordma)
 {
     size_t i;
     mca_bml_base_btl_t* bml_btl;
@@ -420,12 +424,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
     for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
         bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
         if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
-                    hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
+                    hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS)
             return OMPI_SUCCESS;
     }

     MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
-                                   hdr_send_offset);
+                                   hdr_send_offset, size);

     return OMPI_ERR_OUT_OF_RESOURCE;
 }
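The ACK now carries a size alongside the offset, and both must be captured when the ACK is deferred so the retry is byte-for-byte identical. The try-each-btl-then-queue pattern in isolation, with hypothetical stubs standing in for the btl attempt and the pending list:

    #include <stdbool.h>
    #include <stdio.h>

    #define NUM_EAGER_BTLS 2

    /* hypothetical stand-in: attempt to send an ACK over one transport */
    static bool try_ack_on_btl (int btl, unsigned long long offset, unsigned long long size) {
        (void) btl; (void) offset; (void) size;
        return false; /* pretend every btl is out of resources */
    }

    static void queue_pending_ack (unsigned long long offset, unsigned long long size) {
        printf ("queued ACK: offset=%llu size=%llu\n", offset, size);
    }

    static int ack_send (unsigned long long offset, unsigned long long size) {
        for (int i = 0; i < NUM_EAGER_BTLS; ++i) {
            if (try_ack_on_btl (i, offset, size)) {
                return 0;  /* OMPI_SUCCESS */
            }
        }
        /* both offset and size are recorded for the deferred retry */
        queue_pending_ack (offset, size);
        return -1;         /* OMPI_ERR_OUT_OF_RESOURCE */
    }

    int main (void) {
        return ack_send (0, 4096) == -1 ? 0 : 1;
    }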


@@ -250,27 +250,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
  */
 static void
-mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
-                             struct mca_btl_base_endpoint_t* ep,
-                             struct mca_btl_base_descriptor_t* des,
-                             int status )
+mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
 {
-    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
-    size_t req_bytes_delivered;
+    mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;

     /* count bytes of user data actually delivered and check for request completion */
-    if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
-        req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
-                                                                  (void *) des->des_local,
-                                                                  des->des_local_count, 0);
-        OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+    if (OPAL_LIKELY(0 < rdma_length)) {
+        OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length);
     }

-    sendreq->src_des = NULL;
-
     send_request_pml_complete_check(sendreq);
-
-    /* free the descriptor */
-    mca_bml_base_free(bml_btl, des);
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
@@ -407,17 +398,14 @@ int mca_pml_ob1_send_request_start_buffered(
     /* build rendezvous header */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
-    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
-                 sendreq->req_send.req_base.req_proc);
+    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
+                                        sendreq->req_send.req_base.req_comm->c_contextid,
+                                        sendreq->req_send.req_base.req_comm->c_my_rank,
+                                        sendreq->req_send.req_base.req_tag,
+                                        (uint16_t)sendreq->req_send.req_base.req_sequence,
+                                        sendreq->req_send.req_bytes_packed, sendreq);
+
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);

     /* update lengths */
     segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
@@ -490,15 +478,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
     if(NULL != bml_btl->btl->btl_sendi) {
         mca_pml_ob1_match_hdr_t match;
-        match.hdr_common.hdr_flags = 0;
-        match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-        match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-        match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-        match.hdr_tag = sendreq->req_send.req_base.req_tag;
-        match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-        ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH,
-                     sendreq->req_send.req_base.req_proc);
+        mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                       sendreq->req_send.req_base.req_comm->c_contextid,
+                                       sendreq->req_send.req_base.req_comm->c_my_rank,
+                                       sendreq->req_send.req_base.req_tag,
+                                       (uint16_t)sendreq->req_send.req_base.req_sequence);
+
+        ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

         /* try to send immediately */
         rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
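mca_pml_ob1_match_hdr_prepare collapses the six field stores the old code repeated at every call site. Its body is not part of this diff; the following is a sketch consistent with the removed assignments, using simplified struct definitions rather than the real ob1 headers:

    #include <stdint.h>
    #include <stdio.h>

    /* simplified stand-ins; field names follow the removed assignments,
     * the layout is illustrative only */
    typedef struct { uint8_t hdr_type; uint8_t hdr_flags; } common_hdr_t;
    typedef struct {
        common_hdr_t hdr_common;
        uint16_t hdr_ctx;
        int32_t  hdr_src;
        int32_t  hdr_tag;
        uint16_t hdr_seq;
    } match_hdr_t;

    /* one helper replaces the per-field stores at every call site */
    static inline void match_hdr_prepare (match_hdr_t *hdr, uint8_t type, uint8_t flags,
                                          uint16_t ctx, int32_t src, int32_t tag, uint16_t seq)
    {
        hdr->hdr_common.hdr_type  = type;
        hdr->hdr_common.hdr_flags = flags;
        hdr->hdr_ctx = ctx;
        hdr->hdr_src = src;
        hdr->hdr_tag = tag;
        hdr->hdr_seq = seq;
    }

    int main (void)
    {
        match_hdr_t hdr;
        match_hdr_prepare (&hdr, /* MATCH */ 1, 0, 42, 0, 99, 7);
        printf ("ctx %u src %d tag %d seq %u\n", (unsigned) hdr.hdr_ctx,
                (int) hdr.hdr_src, (int) hdr.hdr_tag, (unsigned) hdr.hdr_seq);
        return 0;
    }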
@@ -565,15 +551,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
     /* build match header */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
-                 sendreq->req_send.req_base.req_proc);
+    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                   sendreq->req_send.req_base.req_comm->c_contextid,
+                                   sendreq->req_send.req_base.req_comm->c_my_rank,
+                                   sendreq->req_send.req_base.req_tag,
+                                   (uint16_t)sendreq->req_send.req_base.req_sequence);
+
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

     /* update lengths */
     segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
@@ -631,15 +615,13 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
     /* build match header */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
-                 sendreq->req_send.req_base.req_proc);
+    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                   sendreq->req_send.req_base.req_comm->c_contextid,
+                                   sendreq->req_send.req_base.req_comm->c_my_rank,
+                                   sendreq->req_send.req_base.req_tag,
+                                   (uint16_t)sendreq->req_send.req_base.req_sequence);
+
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

     /* short message */
     des->des_cbfunc = mca_pml_ob1_match_completion_free;
@@ -673,79 +655,67 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
      * one RDMA capable BTLs). This way round robin distribution of RDMA
      * operation is achieved.
      */
+    mca_btl_base_registration_handle_t *local_handle;
-    mca_btl_base_descriptor_t *des, *src = NULL;
+    mca_btl_base_descriptor_t *des;
+    mca_pml_ob1_rdma_frag_t *frag;
     mca_pml_ob1_rget_hdr_t *hdr;
-    size_t seg_size;
+    size_t reg_size;
+    void *data_ptr;
     int rc;

-    sendreq->src_des = NULL;
-
     bml_btl = sendreq->req_rdma[0].bml_btl;
     if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
+        sendreq->rdma_frag = NULL;
         /* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
         return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
                                                     MCA_PML_OB1_HDR_FLAGS_PIN);
     }

-    MEMCHECKER(
-        memchecker_call(&opal_memchecker_base_mem_defined,
-                        sendreq->req_send.req_base.req_addr,
-                        sendreq->req_send.req_base.req_count,
-                        sendreq->req_send.req_base.req_datatype);
-    );
-
-    /* prepare source descriptor/segment(s) */
-    /* PML owns this descriptor and will free it in */
-    /*  mca_pml_ob1_rget_completion */
-    mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg,
-                              &sendreq->req_send.req_base.req_convertor,
-                              MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET |
-                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src );
-    MEMCHECKER(
-        memchecker_call(&opal_memchecker_base_mem_noaccess,
-                        sendreq->req_send.req_base.req_addr,
-                        sendreq->req_send.req_base.req_count,
-                        sendreq->req_send.req_base.req_datatype);
-    );
-    if( OPAL_UNLIKELY(NULL == src) ) {
-        return OMPI_ERR_OUT_OF_RESOURCE;
-    }
-    src->des_cbfunc = mca_pml_ob1_rget_completion;
-    src->des_cbdata = sendreq;
-
-    sendreq->src_des = src;
-
-    seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
+    /* at this time ob1 does not support non-contiguous gets. the convertor represents a
+     * contiguous block of memory */
+    opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);
+
+    local_handle = sendreq->req_rdma[0].btl_reg;
+
+    /* allocate an rdma fragment to keep track of the request size for use in the fin message */
+    MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
+    if (OPAL_UNLIKELY(NULL == frag)) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* fill in necessary fragment data */
+    frag->rdma_req = sendreq;
+    frag->rdma_bml = bml_btl;
+    frag->rdma_length = size;
+    frag->cbfunc = mca_pml_ob1_rget_completion;
+    /* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */
+
+    /* save the fragment for get->put fallback */
+    sendreq->rdma_frag = frag;
+
+    reg_size = bml_btl->btl->btl_registration_handle_size;

     /* allocate space for get hdr + segment list */
-    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size,
+    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
     if( OPAL_UNLIKELY(NULL == des) ) {
         /* NTH: no need to reset the convertor here. it will be reset before it is retried */
-        mca_bml_base_free(bml_btl, src);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }

     /* build match header */
     hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval;
-    /* TODO -- Add support for multiple segments for get */
-    hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
-    hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET;
-    hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
-    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
-    hdr->hdr_des.pval = src;
-    hdr->hdr_seg_cnt = src->des_local_count;
+    mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
+                                  sendreq->req_send.req_base.req_comm->c_contextid,
+                                  sendreq->req_send.req_base.req_comm->c_my_rank,
+                                  sendreq->req_send.req_base.req_tag,
+                                  (uint16_t)sendreq->req_send.req_base.req_sequence,
+                                  sendreq->req_send.req_bytes_packed, sendreq,
+                                  frag, data_ptr, local_handle, reg_size);

     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);

-    /* copy segment data */
-    memcpy (hdr + 1, src->des_local, seg_size);
-
     des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
     des->des_cbdata = sendreq;
@@ -763,12 +733,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
     rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
     if (OPAL_UNLIKELY(rc < 0)) {
         mca_bml_base_free(bml_btl, des);
-
-        if (sendreq->src_des) {
-            mca_bml_base_free (bml_btl, sendreq->src_des);
-            sendreq->src_des = NULL;
-        }
-
         return rc;
     }
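The RGET control message is now a fixed header followed by the raw btl registration handle: the sender allocates sizeof(*hdr) + btl_registration_handle_size, and the receiver peels the handle back off with memcpy(frag->remote_handle, hdr + 1, ...) as in the recvreq.c hunk earlier. A standalone sketch of that wire layout, with invented struct contents and sizes:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>

    /* illustrative header: the real mca_pml_ob1_rget_hdr_t carries more fields */
    typedef struct {
        uint64_t msg_length;
        uint64_t src_ptr;   /* remote virtual address to get from */
    } toy_rget_hdr_t;

    int main (void) {
        const size_t reg_size = 16;               /* btl_registration_handle_size */
        unsigned char handle[16] = "fake-handle....";

        /* sender side: pack header + handle into one control buffer */
        unsigned char *ctl = malloc (sizeof (toy_rget_hdr_t) + reg_size);
        if (NULL == ctl) return 1;
        toy_rget_hdr_t *hdr = (toy_rget_hdr_t *) ctl;
        hdr->msg_length = 1 << 20;
        hdr->src_ptr = (uintptr_t) handle;        /* stand-in source address */
        memcpy (hdr + 1, handle, reg_size);       /* handle rides behind the header */

        /* receiver side: recover the opaque handle */
        unsigned char remote_handle[16];
        memcpy (remote_handle, hdr + 1, reg_size);
        printf ("recovered handle: %.4s...\n", remote_handle);
        free (ctl);
        return 0;
    }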
@@ -828,17 +792,14 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
     /* build hdr */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = flags;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
-    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
-                 sendreq->req_send.req_base.req_proc);
+    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags,
+                                        sendreq->req_send.req_base.req_comm->c_contextid,
+                                        sendreq->req_send.req_base.req_comm->c_my_rank,
+                                        sendreq->req_send.req_base.req_tag,
+                                        (uint16_t)sendreq->req_send.req_base.req_sequence,
+                                        sendreq->req_send.req_bytes_packed, sendreq);
+
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);

     /* first fragment of a long message */
     des->des_cbdata = sendreq;
@@ -1047,11 +1008,8 @@ cannot_pack:
     /* setup header */
     hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG;
-    hdr->hdr_frag_offset = range->range_send_offset;
-    hdr->hdr_src_req.pval = sendreq;
-    hdr->hdr_dst_req = sendreq->req_recv;
+    mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
+                                  sendreq->req_recv.lval);

     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
                  sendreq->req_send.req_base.req_proc);
@@ -1108,37 +1066,65 @@ cannot_pack:
 }

+/**
+ * A put fragment could not be started. Queue the fragment to be retried later or
+ * fall back on send/recv.
+ */
+static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
+{
+    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
+
+    if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
+        /* queue the frag for later if there was a resource error */
+        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
+        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+    } else {
+        /* tell receiver to deregister memory */
+        mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc,
+                              bml_btl, frag->rdma_hdr.hdr_rdma.hdr_frag,
+                              0, MCA_BTL_NO_ORDER, 1);
+
+        /* send fragment by copy in/out */
+        mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
+                                             frag->rdma_length);
+        /* if a pointer to a receive request is not set it means that
+         * ACK was not yet received. Don't schedule sends before ACK */
+        if (NULL != sendreq->req_recv.pval)
+            mca_pml_ob1_send_request_schedule (sendreq);
+    }
+}
+
 /**
  * An RDMA put operation has completed:
  * (1) Update request status and if required set completed
  * (2) Send FIN control message to the destination
  */
-static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
-                                        struct mca_btl_base_endpoint_t* ep,
-                                        struct mca_btl_base_descriptor_t* des,
-                                        int status )
+static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
+                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
+                                        void *context, void *cbdata, int status)
 {
-    mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
+    mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
     mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+    mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;

     /* check completion status */
-    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
-        /* TSW - FIX */
-        OMPI_ERROR_LOG(status);
-        ompi_rte_abort(-1, NULL);
-    }
-
-    mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
-                         bml_btl,
-                         frag->rdma_hdr.hdr_rdma.hdr_des,
-                         des->order, 0);
-
-    /* check for request completion */
-    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
-
-    send_request_pml_complete_check(sendreq);
+    if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
+        /* TODO -- re-add ordering */
+        mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
+                              frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
+                              0, 0);
+
+        /* check for request completion */
+        OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
+
+        send_request_pml_complete_check(sendreq);
+    } else {
+        /* try to fall back on send/recv */
+        mca_pml_ob1_send_request_put_frag_failed (frag, status);
+    }

     MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
@@ -1148,80 +1134,44 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
 int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
 {
     mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
-    mca_mpool_base_registration_t *reg = NULL;
+    mca_btl_base_registration_handle_t *local_handle = NULL;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
-    mca_btl_base_descriptor_t *des;
-    size_t save_size = frag->rdma_length;
     int rc;

-    if (OPAL_LIKELY(NULL == sendreq->src_des)) {
-        /* setup descriptor */
-        mca_bml_base_prepare_src( bml_btl,
-                                  reg,
-                                  &frag->convertor,
-                                  MCA_BTL_NO_ORDER,
-                                  0,
-                                  &frag->rdma_length,
-                                  MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
-                                  MCA_BTL_DES_FLAGS_PUT,
-                                  &des );
-        if( OPAL_UNLIKELY(NULL == des) ) {
-            if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
-                size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
-                frag->rdma_length = save_size;
-                opal_convertor_set_position(&frag->convertor, &offset);
-                OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-                opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
-                OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            } else {
-                mca_pml_ob1_send_request_t *sendreq =
-                    (mca_pml_ob1_send_request_t*)frag->rdma_req;
-
-                /* tell receiver to unregister memory */
-                mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
-                                     bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
-                                     MCA_BTL_NO_ORDER, 1);
-
-                /* send fragment by copy in/out */
-                mca_pml_ob1_send_request_copy_in_out(sendreq,
-                                                     frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
-                /* if a pointer to a receive request is not set it means that
-                 * ACK was not yet received. Don't schedule sends before ACK */
-                if(NULL != sendreq->req_recv.pval)
-                    mca_pml_ob1_send_request_schedule(sendreq);
-            }
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        }
-    } else {
-        /* already have a source descriptor */
-        des = sendreq->src_des;
-        sendreq->src_des = NULL;
-    }
-
-    des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
-    des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
-    des->des_cbfunc = mca_pml_ob1_put_completion;
-    des->des_cbdata = frag;
+    if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
+        /* Check if the segment is already registered */
+        for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
+            if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
+                /* do not copy the handle to the fragment to avoid deregistering it twice */
+                local_handle = sendreq->req_rdma[i].btl_reg;
+                break;
+            }
+        }
+
+        if (NULL == frag->local_handle) {
+            /* Not already registered. Register the region with the BTL. */
+            mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
+                                       &frag->local_handle);
+
+            if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
+                mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
+                return OMPI_ERR_OUT_OF_RESOURCE;
+            }
+
+            local_handle = frag->local_handle;
+        }
+    }

     PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
                                   &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );

-    rc = mca_bml_base_put(bml_btl, des);
+    rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
+                           (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
+                           0, mca_pml_ob1_put_completion, frag);
     if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        mca_bml_base_free(bml_btl, des);
-        frag->rdma_length = save_size;
-        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
-            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-            opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
-            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        } else {
-            /* TSW - FIX */
-            OMPI_ERROR_LOG(rc);
-            ompi_rte_abort(-1, NULL);
-        }
+        mca_pml_ob1_send_request_put_frag_failed (frag, rc);
+        return rc;
    }

    return OMPI_SUCCESS;
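The new-style put takes local/remote addresses and registration handles directly and reports completion through a callback shaped like mca_pml_ob1_put_completion above, instead of through a prepared descriptor. A toy of that callback-style contract; every name below is invented, and the signature only mirrors the (context, cbdata, status) tail of the real one:

    #include <stdio.h>
    #include <stddef.h>

    typedef void (*put_cb_fn) (void *context, void *cbdata, int status);

    /* toy async put: "transfers" and then invokes the completion callback */
    static int toy_put (void *local, void *remote, size_t size,
                        put_cb_fn cbfunc, void *context, void *cbdata) {
        (void) local; (void) remote; (void) size;
        cbfunc (context, cbdata, 0 /* success */);
        return 0;
    }

    static void my_completion (void *context, void *cbdata, int status) {
        if (0 == status) {
            printf ("put done, send FIN for frag %p (btl %p)\n", cbdata, context);
        } else {
            printf ("put failed (%d), fall back to send/recv\n", status);
        }
    }

    int main (void) {
        char src[64], dst[64];
        int frag = 42, btl = 7;
        return toy_put (src, dst, sizeof (src), my_completion, &btl, &frag);
    }

Note that a synchronous error from the put call itself and an asynchronous failure reported to the callback both funnel into the same fallback path in the code above.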
@@ -1240,7 +1190,6 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
 {
     mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
     mca_pml_ob1_rdma_frag_t* frag;
-    size_t i, size = 0;

     if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
         OPAL_THREAD_ADD32(&sendreq->req_state, -1);
@@ -1248,6 +1197,7 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
     sendreq->req_recv.pval = hdr->hdr_recv_req.pval;

+    if (NULL == sendreq->rdma_frag) {
        MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
        if( OPAL_UNLIKELY(NULL == frag) ) {
@@ -1255,54 +1205,28 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
            OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
            ompi_rte_abort(-1, NULL);
        }
+    } else {
+        /* rget fallback on put */
+        frag = sendreq->rdma_frag;
+        sendreq->rdma_frag = NULL;
+        sendreq->req_state = 0;
+    }
-
-    assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
-
-    /* setup fragment */
-    memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
-
-    for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
-        mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size);
-
-#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
-        if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
-            (ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
-            size += opal_swap_bytes4(seg->seg_len);
-        } else
-#endif
-        {
-            size += seg->seg_len;
-        }
-    }
+
+    /* copy registration data */
+    memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);

     frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
     frag->rdma_hdr.hdr_rdma = *hdr;
     frag->rdma_req = sendreq;
-    frag->rdma_ep = bml_endpoint;
-    frag->rdma_length = size;
+    frag->rdma_length = hdr->hdr_dst_size;
     frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
-    frag->reg = NULL;
+    frag->remote_address = hdr->hdr_dst_ptr;
     frag->retries = 0;

-    if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
-        /* get fallback path */
-        sendreq->req_state = 0;
-    }
-
-    /* lookup the corresponding registration */
-    for(i=0; i<sendreq->req_rdma_cnt; i++) {
-        if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
-            frag->reg = sendreq->req_rdma[i].btl_reg;
-            break;
-        }
-    }
-
-    /* RDMA writes may proceed in parallel to send and to each other, so
-     * create clone of the convertor for each RDMA fragment
-     */
-    size = hdr->hdr_rdma_offset;
-    opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
-                                       &frag->convertor, 0, &size);
+    /* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
+     * non-contiguous RDMA. If that changes this code will be wrong. */
+    opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
+                                       hdr->hdr_rdma_offset, &frag->local_address);

     mca_pml_ob1_send_request_put_frag(frag);
 }


@@ -12,7 +12,7 @@
  *                         All rights reserved.
  * Copyright (c) 2009      Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2011-2012 NVIDIA Corporation. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t {
     mca_pml_ob1_send_pending_t req_pending;
     opal_mutex_t req_send_range_lock;
     opal_list_t req_send_ranges;
-    mca_btl_base_descriptor_t *src_des;
+    mca_pml_ob1_rdma_frag_t *rdma_frag;
     mca_pml_ob1_com_btl_t req_rdma[1];
 };
 typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
@@ -127,7 +127,6 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
         OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item);      \
         sendreq = (mca_pml_ob1_send_request_t*)item;                    \
         sendreq->req_send.req_base.req_proc = proc;                     \
-        sendreq->src_des = NULL;                                        \
     }                                                                   \
 }
@@ -169,9 +168,12 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s
     /* return mpool resources */
     for(r = 0; r < sendreq->req_rdma_cnt; r++) {
-        mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
-        if( NULL != reg && reg->mpool != NULL ) {
-            reg->mpool->mpool_deregister(reg->mpool, reg);
+        struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
+        mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
+
+        if (NULL != handle) {
+            mca_bml_base_deregister_mem (bml_btl, handle);
+            sendreq->req_rdma[r].btl_reg = NULL;
         }
     }
     sendreq->req_rdma_cnt = 0;
@@ -220,6 +222,10 @@ do {
     do {                                                                \
         /* Let the base handle the reference counts */                  \
         MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));         \
+        if (sendreq->rdma_frag) {                                       \
+            MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag);          \
+            sendreq->rdma_frag = NULL;                                  \
+        }                                                               \
         OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests,          \
                                   (ompi_free_list_item_t*)sendreq);     \
     } while(0)


@@ -1,4 +1,4 @@
-/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -217,6 +217,14 @@ static inline void opal_convertor_get_current_pointer( const opal_convertor_t* p
     *position = (void*)base;
 }

+static inline void opal_convertor_get_offset_pointer( const opal_convertor_t* pConv,
+                                                      size_t offset, void** position )
+{
+    unsigned char* base = pConv->pBaseBuf + offset + pConv->pDesc->true_lb;
+    *position = (void*)base;
+}
+
 /*
  *
  */
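opal_convertor_get_offset_pointer is plain pointer arithmetic: base buffer plus the requested offset plus the datatype's true lower bound. A self-contained replica of that computation; the struct below mimics only the two fields the function touches:

    #include <stdio.h>
    #include <stddef.h>

    typedef struct { ptrdiff_t true_lb; } toy_desc_t;
    typedef struct {
        unsigned char *pBaseBuf;
        const toy_desc_t *pDesc;
    } toy_convertor_t;

    /* same arithmetic as opal_convertor_get_offset_pointer above */
    static inline void toy_get_offset_pointer (const toy_convertor_t *pConv,
                                               size_t offset, void **position) {
        unsigned char *base = pConv->pBaseBuf + offset + pConv->pDesc->true_lb;
        *position = (void *) base;
    }

    int main (void) {
        unsigned char buffer[256];
        toy_desc_t desc = { .true_lb = 0 };
        toy_convertor_t conv = { buffer, &desc };
        void *ptr;

        toy_get_offset_pointer (&conv, 128, &ptr);
        printf ("offset 128 -> %p (base %p)\n", ptr, (void *) buffer);
        return 0;
    }

This is what lets the sender's put path locate the bytes at hdr_rdma_offset without cloning a convertor per fragment, as the old code did.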