
pml/ob1: update for BTL 3.0 interface

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
This commit is contained in:
Nathan Hjelm 2015-01-06 08:45:08 -07:00 committed by Nathan Hjelm
parent 44fb8369ff
commit c4a0e02261
13 changed files with 715 additions and 749 deletions
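
Note: "BTL 3.0" refers to the reworked byte-transfer-layer interface in which RDMA operations take raw addresses plus opaque registration handles instead of BTL descriptors and mpool registrations. The register/deregister pairing used throughout this commit follows the sketch below (variables assumed from surrounding context; not a literal excerpt):

    /* obtain a registration handle for a local buffer; a BTL with no
     * btl_register_mem hook needs no registration at all */
    mca_btl_base_registration_handle_t *handle = NULL;
    if (NULL != bml_btl->btl->btl_register_mem) {
        handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint,
                                                 base, size, MCA_BTL_REG_FLAG_REMOTE_READ);
    }
    /* ... use the handle for get/put, then release it ... */
    mca_bml_base_deregister_mem (bml_btl, handle);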

View file

@@ -14,7 +14,7 @@
  * Copyright (c) 2006-2008 University of Houston. All rights reserved.
  * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
  * $COPYRIGHT$
@@ -500,17 +500,17 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
     case MCA_PML_OB1_HDR_TYPE_RGET:
         type = "RGET";
         snprintf( header, 128, "ctx %5d src %d tag %d seq %d msg_length %" PRIu64
-                  "seg_cnt %d hdr_des %" PRIu64,
+                  "frag %" PRIu64 " src_ptr %" PRIu64,
                   hdr->hdr_rndv.hdr_match.hdr_ctx, hdr->hdr_rndv.hdr_match.hdr_src,
                   hdr->hdr_rndv.hdr_match.hdr_tag, hdr->hdr_rndv.hdr_match.hdr_seq,
-                  hdr->hdr_rndv.hdr_msg_length,
-                  hdr->hdr_rget.hdr_seg_cnt, hdr->hdr_rget.hdr_des.lval);
+                  hdr->hdr_rndv.hdr_msg_length, hdr->hdr_rget.hdr_frag.lval,
+                  hdr->hdr_rget.hdr_src_ptr);
         break;
     case MCA_PML_OB1_HDR_TYPE_ACK:
         type = "ACK";
-        snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64,
+        snprintf( header, 128, "src_req %p dst_req %p offset %" PRIu64 " size %" PRIu64,
                   hdr->hdr_ack.hdr_src_req.pval, hdr->hdr_ack.hdr_dst_req.pval,
-                  hdr->hdr_ack.hdr_send_offset);
+                  hdr->hdr_ack.hdr_send_offset, hdr->hdr_ack.hdr_send_size);
         break;
     case MCA_PML_OB1_HDR_TYPE_FRAG:
         type = "FRAG";
@@ -520,10 +520,11 @@ static void mca_pml_ob1_dump_hdr(mca_pml_ob1_hdr_t* hdr)
         break;
     case MCA_PML_OB1_HDR_TYPE_PUT:
         type = "PUT";
-        snprintf( header, 128, "seg_cnt %d dst_req %p src_des %p recv_req %p offset %" PRIu64 " [%p %" PRIu64 "]",
-                  hdr->hdr_rdma.hdr_seg_cnt, hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_des.pval,
+        snprintf( header, 128, "dst_req %p src_frag %p recv_req %p offset %" PRIu64
+                  " dst_ptr %" PRIu64 " dst_size %" PRIu64,
+                  hdr->hdr_rdma.hdr_req.pval, hdr->hdr_rdma.hdr_frag.pval,
                   hdr->hdr_rdma.hdr_recv_req.pval, hdr->hdr_rdma.hdr_rdma_offset,
-                  hdr->hdr_rdma.hdr_segs[0].seg_addr.pval, hdr->hdr_rdma.hdr_segs[0].seg_len);
+                  hdr->hdr_rdma.hdr_dst_ptr, hdr->hdr_rdma.hdr_dst_size);
         break;
     case MCA_PML_OB1_HDR_TYPE_FIN:
         type = "FIN";
@@ -638,37 +639,32 @@ static void mca_pml_ob1_fin_completion( mca_btl_base_module_t* btl,
  */
 int mca_pml_ob1_send_fin( ompi_proc_t* proc,
                           mca_bml_base_btl_t* bml_btl,
-                          opal_ptr_t hdr_des,
+                          opal_ptr_t hdr_frag,
+                          uint64_t rdma_size,
                           uint8_t order,
-                          uint32_t status )
+                          int status )
 {
     mca_btl_base_descriptor_t* fin;
-    mca_pml_ob1_fin_hdr_t* hdr;
     int rc;
     mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_FLAGS_SIGNAL);
     if(NULL == fin) {
-        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
+        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
     fin->des_cbfunc = mca_pml_ob1_fin_completion;
     fin->des_cbdata = NULL;
     /* fill in header */
-    hdr = (mca_pml_ob1_fin_hdr_t*)fin->des_local->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;
-    hdr->hdr_des = hdr_des;
-    hdr->hdr_fail = status;
+    mca_pml_ob1_fin_hdr_prepare ((mca_pml_ob1_fin_hdr_t *) fin->des_segments->seg_addr.pval,
+                                 0, hdr_frag.lval, status ? status : (int64_t) rdma_size);
     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FIN, proc);
     /* queue request */
-    rc = mca_bml_base_send( bml_btl,
-                            fin,
-                            MCA_PML_OB1_HDR_TYPE_FIN );
+    rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
     if( OPAL_LIKELY( rc >= 0 ) ) {
         if( OPAL_LIKELY( 1 == rc ) ) {
             MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
@@ -676,7 +672,7 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
         return OMPI_SUCCESS;
     }
     mca_bml_base_free(bml_btl, fin);
-    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
+    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_frag, rdma_size, bml_btl, order, status);
     return OMPI_ERR_OUT_OF_RESOURCE;
 }
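
Note: the status and the completed byte count are now folded into the FIN header's single signed hdr_size field via the expression status ? status : (int64_t) rdma_size. Since OMPI error codes are negative, the convention can be demonstrated standalone (hypothetical helper, not from this commit):

    #include <stdint.h>
    #include <stdio.h>

    /* mirrors the expression used in mca_pml_ob1_send_fin() above */
    static int64_t fin_hdr_size (int status, uint64_t rdma_size)
    {
        return status ? status : (int64_t) rdma_size;
    }

    int main (void)
    {
        printf ("%lld\n", (long long) fin_hdr_size (0, 4096));  /* 4096: bytes completed */
        printf ("%lld\n", (long long) fin_hdr_size (-2, 4096)); /* -2: error status */
        return 0;
    }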
@@ -717,6 +713,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
                 pckt->hdr.hdr_ack.hdr_src_req.lval,
                 pckt->hdr.hdr_ack.hdr_dst_req.pval,
                 pckt->hdr.hdr_ack.hdr_send_offset,
+                pckt->hdr.hdr_ack.hdr_send_size,
                 pckt->hdr.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA);
             if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@@ -728,9 +725,10 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
             break;
         case MCA_PML_OB1_HDR_TYPE_FIN:
             rc = mca_pml_ob1_send_fin(pckt->proc, send_dst,
-                                      pckt->hdr.hdr_fin.hdr_des,
+                                      pckt->hdr.hdr_fin.hdr_frag,
+                                      pckt->hdr.hdr_fin.hdr_size,
                                       pckt->order,
-                                      pckt->hdr.hdr_fin.hdr_fail);
+                                      pckt->status);
             if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == rc) ) {
                 return;
             }

View file

@@ -12,7 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
  * reserved.
  * $COPYRIGHT$
  *
@@ -216,6 +216,7 @@ struct mca_pml_ob1_pckt_pending_t {
     mca_pml_ob1_hdr_t hdr;
     struct mca_bml_base_btl_t *bml_btl;
     uint8_t order;
+    int status;
 };
 typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
 OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
@@ -234,17 +235,17 @@ do { \
                      (ompi_free_list_item_t*)pckt); \
 } while(0)
-#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B, O, S) \
+#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, Sz, B, O, S) \
 do { \
     mca_pml_ob1_pckt_pending_t *_pckt; \
     \
     MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt); \
-    _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
-    _pckt->hdr.hdr_fin.hdr_des = (D); \
-    _pckt->hdr.hdr_fin.hdr_fail = (S); \
+    mca_pml_ob1_fin_hdr_prepare (&_pckt->hdr.hdr_fin, 0, \
+                                 (D).lval, (Sz)); \
     _pckt->proc = (P); \
     _pckt->bml_btl = (B); \
     _pckt->order = (O); \
+    _pckt->status = (S); \
    OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
    opal_list_append(&mca_pml_ob1.pckt_pending, \
                     (opal_list_item_t*)_pckt); \
@@ -253,7 +254,7 @@ do { \
 int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
-                         opal_ptr_t hdr_des, uint8_t order, uint32_t status);
+                         opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status);
 /* This function tries to resend FIN/ACK packets from pckt_pending queue.
  * Packets are added to the queue when sending of FIN or ACK is failed due to
@@ -283,20 +284,6 @@ void mca_pml_ob1_process_pending_rdma(void);
 /*
  * Compute the total number of bytes on supplied descriptor
  */
-static inline size_t
-mca_pml_ob1_compute_segment_length(size_t seg_size, void *segments,
-                                   size_t count, size_t hdrlen)
-{
-    size_t i, length = 0;
-    mca_btl_base_segment_t *segment = (mca_btl_base_segment_t*)segments;
-    for (i = 0; i < count ; ++i) {
-        length += segment->seg_len;
-        segment = (mca_btl_base_segment_t *)((char *)segment + seg_size);
-    }
-    return (length - hdrlen);
-}
 static inline size_t
 mca_pml_ob1_compute_segment_length_base(mca_btl_base_segment_t *segments,
                                         size_t count, size_t hdrlen)
@@ -338,7 +325,7 @@ mca_pml_ob1_compute_segment_length_remote (size_t seg_size, void *segments,
 /* represent BTL chosen for sending request */
 struct mca_pml_ob1_com_btl_t {
     mca_bml_base_btl_t *bml_btl;
-    struct mca_mpool_base_registration_t* btl_reg;
+    struct mca_btl_base_registration_handle_t *btl_reg;
     size_t length;
 };
 typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;

View file

@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  * University Research and Technology
@@ -12,6 +13,8 @@
  * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -123,19 +126,20 @@ size_t mca_pml_ob1_rdma_cuda_btls(
             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);
         if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
-            mca_mpool_base_registration_t* reg = NULL;
-            mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
-            if( NULL != btl_mpool ) {
+            mca_btl_base_registration_handle_t *handle = NULL;
+            if( NULL != bml_btl->btl->btl_register_mem ) {
                 /* register the memory */
-                btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);
+                handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint,
+                                                         base, size, MCA_BTL_REG_FLAG_CUDA_GPU_MEM |
+                                                         MCA_BTL_REG_FLAG_REMOTE_READ);
             }
-            if(NULL == reg)
+            if(NULL == handle)
                 continue;
             rdma_btls[num_btls_used].bml_btl = bml_btl;
-            rdma_btls[num_btls_used].btl_reg = reg;
+            rdma_btls[num_btls_used].btl_reg = handle;
             weight_total += bml_btl->btl_weight;
             num_btls_used++;
         }
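
Note: the mpool flag MCA_MPOOL_FLAGS_CUDA_GPU_MEM becomes a BTL registration flag combined with the access the peer needs. A sketch of the flag combination used above, assuming the BTL 3.0 flags parameter is a 32-bit mask:

    /* register a CUDA buffer that the peer will read with get */
    uint32_t flags = MCA_BTL_REG_FLAG_CUDA_GPU_MEM | MCA_BTL_REG_FLAG_REMOTE_READ;
    handle = bml_btl->btl->btl_register_mem (bml_btl->btl, bml_btl->btl_endpoint,
                                             base, size, flags);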

View file

@@ -11,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 2009 IBM Corporation. All rights reserved.
- * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
  * reserved.
  * $COPYRIGHT$
  *
@@ -64,6 +64,13 @@ struct mca_pml_ob1_common_hdr_t {
 };
 typedef struct mca_pml_ob1_common_hdr_t mca_pml_ob1_common_hdr_t;
+static inline void mca_pml_ob1_common_hdr_prepare (mca_pml_ob1_common_hdr_t *hdr, uint8_t hdr_type,
+                                                   uint8_t hdr_flags)
+{
+    hdr->hdr_type = hdr_type;
+    hdr->hdr_flags = hdr_flags;
+}
 #define MCA_PML_OB1_COMMON_HDR_NTOH(h)
 #define MCA_PML_OB1_COMMON_HDR_HTON(h)
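
Note: the new *_hdr_prepare inline functions replace both the field-by-field header assignments and the old *_HDR_FILL padding macros. For example, the removed pattern from mca_pml_ob1_send_fin() above:

    hdr->hdr_common.hdr_flags = 0;
    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN;

collapses into one call:

    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, 0);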
@@ -89,15 +96,19 @@ struct mca_pml_ob1_match_hdr_t {
 typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t;
+static inline void mca_pml_ob1_match_hdr_prepare (mca_pml_ob1_match_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
+                                                  uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, hdr_type, hdr_flags);
+    hdr->hdr_ctx = hdr_ctx;
+    hdr->hdr_src = hdr_src;
+    hdr->hdr_tag = hdr_tag;
+    hdr->hdr_seq = hdr_seq;
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_MATCH_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_MATCH_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+}
 #define MCA_PML_OB1_MATCH_HDR_NTOH(h) \
 do { \
@@ -111,7 +122,6 @@ do { \
 #define MCA_PML_OB1_MATCH_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_MATCH_HDR_FILL(h); \
     (h).hdr_ctx = htons((h).hdr_ctx); \
     (h).hdr_src = htonl((h).hdr_src); \
     (h).hdr_tag = htonl((h).hdr_tag); \
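
Note: the matching call site appears later in this commit in pml_ob1_isend.c, where six separate assignments become:

    mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
                                   comm->c_contextid, comm->c_my_rank,
                                   tag, seqn);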
@@ -130,12 +140,14 @@ struct mca_pml_ob1_rendezvous_hdr_t {
 };
 typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
-#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RNDV_HDR_FILL(h) \
-    MCA_PML_OB1_MATCH_HDR_FILL((h).hdr_match)
-#else
-#define MCA_PML_OB1_RNDV_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+static inline void mca_pml_ob1_rendezvous_hdr_prepare (mca_pml_ob1_rendezvous_hdr_t *hdr, uint8_t hdr_type, uint8_t hdr_flags,
+                                                       uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
+                                                       uint64_t hdr_msg_length, void *hdr_src_req)
+{
+    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, hdr_type, hdr_flags, hdr_ctx, hdr_src, hdr_tag, hdr_seq);
+    hdr->hdr_msg_length = hdr_msg_length;
+    hdr->hdr_src_req.pval = hdr_src_req;
+}
 /* Note that hdr_src_req is not put in network byte order because it
    is never processed by the receiver, other than being copied into
@@ -149,7 +161,6 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
 #define MCA_PML_OB1_RNDV_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_MATCH_HDR_HTON((h).hdr_match); \
-    MCA_PML_OB1_RNDV_HDR_FILL(h); \
     (h).hdr_msg_length = hton64((h).hdr_msg_length); \
 } while (0)
@@ -158,38 +169,47 @@ typedef struct mca_pml_ob1_rendezvous_hdr_t mca_pml_ob1_rendezvous_hdr_t;
  */
 struct mca_pml_ob1_rget_hdr_t {
     mca_pml_ob1_rendezvous_hdr_t hdr_rndv;
-    uint32_t hdr_seg_cnt;     /**< number of segments for rdma */
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[4];
 #endif
-    opal_ptr_t hdr_des;       /**< source descriptor */
+    opal_ptr_t hdr_frag;      /**< source fragment (for fin) */
+    uint64_t hdr_src_ptr;     /**< source pointer */
+    /* btl registration handle data follows */
 };
 typedef struct mca_pml_ob1_rget_hdr_t mca_pml_ob1_rget_hdr_t;
+static inline void mca_pml_ob1_rget_hdr_prepare (mca_pml_ob1_rget_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint16_t hdr_ctx, int32_t hdr_src, int32_t hdr_tag, uint16_t hdr_seq,
+                                                 uint64_t hdr_msg_length, void *hdr_src_req, void *hdr_frag,
+                                                 void *hdr_src_ptr, void *local_handle, size_t local_handle_size)
+{
+    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RGET, hdr_flags,
+                                        hdr_ctx, hdr_src, hdr_tag, hdr_seq, hdr_msg_length, hdr_src_req);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RGET_HDR_FILL(h) \
-do { \
-    MCA_PML_OB1_RNDV_HDR_FILL((h).hdr_rndv); \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-    (h).hdr_padding[2] = 0; \
-    (h).hdr_padding[3] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_RGET_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+#endif
+    hdr->hdr_frag.pval = hdr_frag;
+    hdr->hdr_src_ptr = (uint64_t)(intptr_t) hdr_src_ptr;
+    /* copy registration handle */
+    memcpy (hdr + 1, local_handle, local_handle_size);
+}
 #define MCA_PML_OB1_RGET_HDR_NTOH(h) \
 do { \
     MCA_PML_OB1_RNDV_HDR_NTOH((h).hdr_rndv); \
     (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
+    (h).hdr_src_ptr = ntoh64((h).hdr_src_ptr); \
 } while (0)
 #define MCA_PML_OB1_RGET_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_RNDV_HDR_HTON((h).hdr_rndv); \
-    MCA_PML_OB1_RGET_HDR_FILL(h); \
     (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
+    (h).hdr_src_ptr = hton64((h).hdr_src_ptr); \
 } while (0)
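
Note: the RGET header is now variable length; the sender's registration handle is copied directly after the fixed struct (memcpy (hdr + 1, ...)) and hdr_src_ptr carries the raw source address instead of a segment list. A sketch of sizing and filling the control message, assuming the BTL 3.0 module field btl_registration_handle_size reports the handle length (an assumption; it does not appear in this diff):

    size_t reg_size = bml_btl->btl->btl_registration_handle_size;
    mca_bml_base_alloc (bml_btl, &des, MCA_BTL_NO_ORDER,
                        sizeof (mca_pml_ob1_rget_hdr_t) + reg_size,
                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    mca_pml_ob1_rget_hdr_prepare ((mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval,
                                  flags, ctx, src, tag, seq, msg_length,
                                  sendreq, frag, src_addr, local_handle, reg_size);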
 /**
@@ -206,19 +226,23 @@ struct mca_pml_ob1_frag_hdr_t {
 };
 typedef struct mca_pml_ob1_frag_hdr_t mca_pml_ob1_frag_hdr_t;
+static inline void mca_pml_ob1_frag_hdr_prepare (mca_pml_ob1_frag_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint64_t hdr_frag_offset, void *hdr_src_req,
+                                                 uint64_t hdr_dst_req)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FRAG, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_FRAG_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-    (h).hdr_padding[2] = 0; \
-    (h).hdr_padding[3] = 0; \
-    (h).hdr_padding[4] = 0; \
-    (h).hdr_padding[5] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_FRAG_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+    hdr->hdr_padding[4] = 0;
+    hdr->hdr_padding[5] = 0;
+#endif
+    hdr->hdr_frag_offset = hdr_frag_offset;
+    hdr->hdr_src_req.pval = hdr_src_req;
+    hdr->hdr_dst_req.lval = hdr_dst_req;
+}
 #define MCA_PML_OB1_FRAG_HDR_NTOH(h) \
 do { \
@@ -229,7 +253,6 @@ do { \
 #define MCA_PML_OB1_FRAG_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_FRAG_HDR_FILL(h); \
     (h).hdr_frag_offset = hton64((h).hdr_frag_offset); \
 } while (0)
@@ -245,38 +268,45 @@ struct mca_pml_ob1_ack_hdr_t {
     opal_ptr_t hdr_src_req;   /**< source request */
     opal_ptr_t hdr_dst_req;   /**< matched receive request */
     uint64_t hdr_send_offset; /**< starting point of copy in/out */
+    uint64_t hdr_send_size;   /**< number of bytes requested (0: all remaining) */
 };
 typedef struct mca_pml_ob1_ack_hdr_t mca_pml_ob1_ack_hdr_t;
+static inline void mca_pml_ob1_ack_hdr_prepare (mca_pml_ob1_ack_hdr_t *hdr, uint8_t hdr_flags,
+                                                uint64_t hdr_src_req, void *hdr_dst_req,
+                                                uint64_t hdr_send_offset, uint64_t hdr_send_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_ACK, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_ACK_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-    (h).hdr_padding[2] = 0; \
-    (h).hdr_padding[3] = 0; \
-    (h).hdr_padding[4] = 0; \
-    (h).hdr_padding[5] = 0; \
-} while (0)
-#else
-#define MCA_PML_OB1_ACK_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+    hdr->hdr_padding[2] = 0;
+    hdr->hdr_padding[3] = 0;
+    hdr->hdr_padding[4] = 0;
+    hdr->hdr_padding[5] = 0;
+#endif
+    hdr->hdr_src_req.lval = hdr_src_req;
+    hdr->hdr_dst_req.pval = hdr_dst_req;
+    hdr->hdr_send_offset = hdr_send_offset;
+    hdr->hdr_send_size = hdr_send_size;
+}
 /* Note that the request headers are not put in NBO because the
    src_req is already in receiver's byte order and the dst_req is not
    used by the receiver for anything other than backpointers in return
    headers */
 #define MCA_PML_OB1_ACK_HDR_NTOH(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
     (h).hdr_send_offset = ntoh64((h).hdr_send_offset); \
+    (h).hdr_send_size = ntoh64((h).hdr_send_size); \
 } while (0)
 #define MCA_PML_OB1_ACK_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_ACK_HDR_FILL(h); \
     (h).hdr_send_offset = hton64((h).hdr_send_offset); \
+    (h).hdr_send_size = hton64((h).hdr_send_size); \
 } while (0)
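
Note: hdr_send_size lets an ACK request a specific region (used by the RDMA fallback paths later in this commit); 0 preserves the old meaning of "everything past hdr_send_offset". The receiver-side interpretation, as it appears in the ACK callback below:

    if (hdr->hdr_ack.hdr_send_size) {
        size = hdr->hdr_ack.hdr_send_size;
    } else {
        size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
    }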
 /**
@@ -288,38 +318,55 @@ struct mca_pml_ob1_rdma_hdr_t {
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[2];   /** two to pad out the hdr to a 4 byte alignment. hdr_req will then be 8 byte aligned after 4 for hdr_seg_cnt */
 #endif
-    uint32_t hdr_seg_cnt;     /**< number of segments for rdma */
+    /* TODO: add real support for multiple destination segments */
     opal_ptr_t hdr_req;       /**< destination request */
-    opal_ptr_t hdr_des;       /**< source descriptor */
+    opal_ptr_t hdr_frag;      /**< receiver fragment */
     opal_ptr_t hdr_recv_req;  /**< receive request (NTH: needed for put fallback on send) */
     uint64_t hdr_rdma_offset; /**< current offset into user buffer */
-    mca_btl_base_segment_t hdr_segs[1]; /**< list of segments for rdma */
+    uint64_t hdr_dst_ptr;     /**< destination address */
+    uint64_t hdr_dst_size;    /**< destination size */
+    /* registration data follows */
 };
 typedef struct mca_pml_ob1_rdma_hdr_t mca_pml_ob1_rdma_hdr_t;
+static inline void mca_pml_ob1_rdma_hdr_prepare (mca_pml_ob1_rdma_hdr_t *hdr, uint8_t hdr_flags,
+                                                 uint64_t hdr_req, void *hdr_frag, void *hdr_recv_req,
+                                                 uint64_t hdr_rdma_offset, void *hdr_dst_ptr,
+                                                 uint64_t hdr_dst_size, void *local_handle,
+                                                 size_t local_handle_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_PUT, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_RDMA_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-} while(0)
-#else
-#define MCA_PML_OB1_RDMA_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+    hdr->hdr_req.lval = hdr_req;
+    hdr->hdr_frag.pval = hdr_frag;
+    hdr->hdr_recv_req.pval = hdr_recv_req;
+    hdr->hdr_rdma_offset = hdr_rdma_offset;
+    hdr->hdr_dst_ptr = (uint64_t)(intptr_t) hdr_dst_ptr;
+    hdr->hdr_dst_size = hdr_dst_size;
+    /* copy segments */
+    memcpy (hdr + 1, local_handle, local_handle_size);
+}
 #define MCA_PML_OB1_RDMA_HDR_NTOH(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
     (h).hdr_seg_cnt = ntohl((h).hdr_seg_cnt); \
     (h).hdr_rdma_offset = ntoh64((h).hdr_rdma_offset); \
+    (h).hdr_dst_ptr = ntoh64((h).hdr_dst_ptr); \
+    (h).hdr_dst_size = ntoh64((h).hdr_dst_size); \
 } while (0)
 #define MCA_PML_OB1_RDMA_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_RDMA_HDR_FILL(h); \
     (h).hdr_seg_cnt = htonl((h).hdr_seg_cnt); \
     (h).hdr_rdma_offset = hton64((h).hdr_rdma_offset); \
+    (h).hdr_dst_ptr = hton64((h).hdr_dst_ptr); \
+    (h).hdr_dst_size = hton64((h).hdr_dst_size); \
 } while (0)
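
Note: a PUT control message now names its target with a raw address/size pair and appends the receiver's registration handle after the struct, mirroring the RGET layout. A hedged sketch of the receiver filling one in (the variable names are illustrative, not from this commit):

    mca_pml_ob1_rdma_hdr_prepare (hdr, flags,
                                  send_req_remote.lval,  /* sender's request */
                                  frag,                  /* receiver rdma fragment */
                                  recvreq,               /* put fallback on send */
                                  recvreq->req_rdma_offset,
                                  dst_addr, dst_size,
                                  local_handle, handle_size);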
 /**
@@ -331,31 +378,34 @@ struct mca_pml_ob1_fin_hdr_t {
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
     uint8_t hdr_padding[2];
 #endif
-    uint32_t hdr_fail;   /**< RDMA operation failed */
-    opal_ptr_t hdr_des;  /**< completed descriptor */
+    int64_t hdr_size;    /**< number of bytes completed (positive), error code (negative) */
+    opal_ptr_t hdr_frag; /**< completed RDMA fragment */
 };
 typedef struct mca_pml_ob1_fin_hdr_t mca_pml_ob1_fin_hdr_t;
+static inline void mca_pml_ob1_fin_hdr_prepare (mca_pml_ob1_fin_hdr_t *hdr, uint8_t hdr_flags,
+                                                uint64_t hdr_frag, int64_t hdr_size)
+{
+    mca_pml_ob1_common_hdr_prepare (&hdr->hdr_common, MCA_PML_OB1_HDR_TYPE_FIN, hdr_flags);
 #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
-#define MCA_PML_OB1_FIN_HDR_FILL(h) \
-do { \
-    (h).hdr_padding[0] = 0; \
-    (h).hdr_padding[1] = 0; \
-} while (0)
-#else
-#define MCA_PML_OB1_FIN_HDR_FILL(h)
-#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
+    hdr->hdr_padding[0] = 0;
+    hdr->hdr_padding[1] = 0;
+#endif
+    hdr->hdr_frag.lval = hdr_frag;
+    hdr->hdr_size = hdr_size;
+}
 #define MCA_PML_OB1_FIN_HDR_NTOH(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_NTOH((h).hdr_common); \
+    (h).hdr_size = ntoh64((h).hdr_size); \
 } while (0)
 #define MCA_PML_OB1_FIN_HDR_HTON(h) \
 do { \
     MCA_PML_OB1_COMMON_HDR_HTON((h).hdr_common); \
-    MCA_PML_OB1_FIN_HDR_FILL(h); \
+    (h).hdr_size = hton64((h).hdr_size); \
 } while (0)
 /**
  * Union of defined hdr types.
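
Note: receivers of a FIN decode hdr_size by sign; the FIN callback later in this commit forwards it unchanged (frag->cbfunc (frag, hdr->hdr_size)), so every fragment callback follows the same pattern:

    if (rdma_length < 0) {
        /* the peer reports a failed RDMA operation; rdma_length is the error code */
    } else {
        /* rdma_length bytes completed remotely */
    }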

View file

@@ -10,7 +10,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
- * Copyright (c) 2007-2014 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2015 Research Organization for Information Science
@@ -68,7 +68,6 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
                                            ompi_proc_t *dst_proc, mca_bml_base_endpoint_t* endpoint,
                                            ompi_communicator_t * comm)
 {
-    mca_btl_base_descriptor_t *des = NULL;
     mca_pml_ob1_match_hdr_t match;
     mca_bml_base_btl_t *bml_btl;
     opal_convertor_t convertor;
@@ -98,28 +97,21 @@ static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
         size = 0;
     }
-    match.hdr_common.hdr_flags = 0;
-    match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-    match.hdr_ctx = comm->c_contextid;
-    match.hdr_src = comm->c_my_rank;
-    match.hdr_tag = tag;
-    match.hdr_seq = seqn;
+    mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                   comm->c_contextid, comm->c_my_rank,
+                                   tag, seqn);
     ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);
     /* try to send immediately */
     rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
                              size, MCA_BTL_NO_ORDER, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
-                             MCA_PML_OB1_HDR_TYPE_MATCH, &des);
+                             MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
     if (count > 0) {
         opal_convertor_cleanup (&convertor);
     }
     if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        if (des) {
-            mca_bml_base_free (bml_btl, des);
-        }
         return rc;
     }
@@ -224,7 +216,7 @@ int mca_pml_ob1_send(void *buf,
     OBJ_CONSTRUCT(sendreq, mca_pml_ob1_send_request_t);
     sendreq->req_send.req_base.req_proc = dst_proc;
-    sendreq->src_des = NULL;
+    sendreq->rdma_frag = NULL;
     MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                   buf,

View file

@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  * University Research and Technology
@@ -9,6 +10,8 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -27,11 +30,6 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdma.h"
-/* Use this registration if no registration needed for a BTL instead of NULL.
- * This will help other code to distinguish case when memory is not registered
- * from case when registration is not needed */
-static mca_mpool_base_registration_t pml_ob1_dummy_reg;
 /*
  * Check to see if memory is registered or can be registered. Build a
  * set of registrations on the request.
@@ -45,7 +43,7 @@ size_t mca_pml_ob1_rdma_btls(
 {
     int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
     double weight_total = 0;
-    int num_btls_used = 0, n;
+    int num_btls_used = 0;
     /* shortcut when there are no rdma capable btls */
     if(num_btls == 0) {
@@ -53,29 +51,33 @@ size_t mca_pml_ob1_rdma_btls(
     }
     /* check to see if memory is registered */
-    for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
-        n++) {
+    for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
         mca_bml_base_btl_t* bml_btl =
             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                                              (bml_endpoint->btl_rdma_index + n) % num_btls);
-        mca_mpool_base_registration_t* reg = &pml_ob1_dummy_reg;
-        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;
-        if( NULL != btl_mpool ) {
-            if(!mca_pml_ob1.leave_pinned) {
-                /* look through existing registrations */
-                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
-            } else {
-                /* register the memory */
-                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
-            }
-            if(NULL == reg)
-                continue;
-        }
+        mca_btl_base_registration_handle_t *reg_handle = NULL;
+        mca_btl_base_module_t *btl = bml_btl->btl;
+        if (btl->btl_register_mem) {
+            /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
+             * 2) the btl supports put, and 3) the fragment is larger than the minimum
+             * pipeline size specified by the BTL */
+            if (!mca_pml_ob1.leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) &&
+                size > btl->btl_min_rdma_pipeline_size) {
+                continue;
+            }
+            /* try to register the memory region with the btl */
+            reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
+                                                size, MCA_BTL_REG_FLAG_REMOTE_READ);
+            if (NULL == reg_handle) {
+                /* btl requires registration but the registration failed */
+                continue;
+            }
+        } /* else no registration is needed with this btl */
         rdma_btls[num_btls_used].bml_btl = bml_btl;
-        rdma_btls[num_btls_used].btl_reg = reg;
+        rdma_btls[num_btls_used].btl_reg = reg_handle;
         weight_total += bml_btl->btl_weight;
         num_btls_used++;
     }
@@ -83,7 +85,7 @@ size_t mca_pml_ob1_rdma_btls(
     /* if we don't use leave_pinned and all BTLs that already have this memory
      * registered amount to less then half of available bandwidth - fall back to
      * pipeline protocol */
-    if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
+    if (0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
         return 0;
     mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
@@ -103,10 +105,7 @@ size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
     for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
         rdma_btls[i].bml_btl =
             mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
-        if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
-            rdma_btls[i].btl_reg = NULL;
-        else
-            rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
+        rdma_btls[i].btl_reg = NULL;
         weight_total += rdma_btls[i].bml_btl->btl_weight;
     }
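
Note: deleting pml_ob1_dummy_reg works because a NULL btl_register_mem hook now unambiguously means "no registration needed". The per-BTL decision above can be condensed as follows (hypothetical helper, not in the commit):

    static bool want_rdma_protocol (mca_btl_base_module_t *btl, size_t size)
    {
        if (NULL == btl->btl_register_mem) {
            return true;   /* no registration required by this btl */
        }
        if (!mca_pml_ob1.leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) &&
            size > btl->btl_min_rdma_pipeline_size) {
            return false;  /* large un-pinned fragment: prefer the pipeline protocol */
        }
        return true;       /* register; skip this btl if registration fails */
    }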

View file

@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  * University Research and Technology
@@ -9,6 +10,8 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -21,9 +24,13 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdmafrag.h"
+static void mca_pml_ob1_rdma_frag_constructor (mca_pml_ob1_rdma_frag_t *frag)
+{
+    frag->local_handle = NULL;
+}
 OBJ_CLASS_INSTANCE(
     mca_pml_ob1_rdma_frag_t,
     ompi_free_list_item_t,
-    NULL,
+    mca_pml_ob1_rdma_frag_constructor,
     NULL);

View file

@@ -10,6 +10,8 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014 Los Alamos National Security, LLC. All rights
+ * reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -32,38 +34,52 @@ typedef enum {
     MCA_PML_OB1_RDMA_GET
 } mca_pml_ob1_rdma_state_t;
+struct mca_pml_ob1_rdma_frag_t;
+typedef void (*mca_pml_ob1_rdma_frag_callback_t)(struct mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length);
+/**
+ * Used to keep track of local and remote RDMA operations.
+ */
 struct mca_pml_ob1_rdma_frag_t {
     ompi_free_list_item_t super;
-    mca_bml_base_btl_t* rdma_bml;
+    mca_bml_base_btl_t *rdma_bml;
     mca_pml_ob1_hdr_t rdma_hdr;
     mca_pml_ob1_rdma_state_t rdma_state;
     size_t rdma_length;
-    uint8_t rdma_segs[MCA_BTL_SEG_MAX_SIZE * MCA_BTL_DES_MAX_SEGMENTS];
     void *rdma_req;
-    struct mca_bml_base_endpoint_t* rdma_ep;
-    opal_convertor_t convertor;
-    mca_mpool_base_registration_t* reg;
     uint32_t retries;
+    mca_pml_ob1_rdma_frag_callback_t cbfunc;
+    uint64_t rdma_offset;
+    void *local_address;
+    mca_btl_base_registration_handle_t *local_handle;
+    uint64_t remote_address;
+    uint8_t remote_handle[MCA_BTL_REG_HANDLE_MAX_SIZE];
 };
 typedef struct mca_pml_ob1_rdma_frag_t mca_pml_ob1_rdma_frag_t;
 OBJ_CLASS_DECLARATION(mca_pml_ob1_rdma_frag_t);
 #define MCA_PML_OB1_RDMA_FRAG_ALLOC(frag) \
 do { \
     ompi_free_list_item_t* item; \
     OMPI_FREE_LIST_WAIT_MT(&mca_pml_ob1.rdma_frags, item); \
     frag = (mca_pml_ob1_rdma_frag_t*)item; \
 } while(0)
-#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
-do { \
-    /* return fragment */ \
-    OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
-                             (ompi_free_list_item_t*)frag); \
-} while(0)
+#define MCA_PML_OB1_RDMA_FRAG_RETURN(frag) \
+do { \
+    /* return fragment */ \
+    if (frag->local_handle) { \
+        mca_bml_base_deregister_mem (frag->rdma_bml, frag->local_handle); \
+        frag->local_handle = NULL; \
+    } \
+    OMPI_FREE_LIST_RETURN_MT(&mca_pml_ob1.rdma_frags, \
+                             (ompi_free_list_item_t*)frag); \
+} while (0)
 END_C_DECLS
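
Note: the fragment now carries everything one RDMA operation needs (local and remote addresses, the local registration handle, the packed remote handle, and a completion callback), so FIN handling never touches BTL descriptors. A sketch of the fill-in pattern (field values illustrative):

    mca_pml_ob1_rdma_frag_t *frag;

    MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
    frag->rdma_bml      = bml_btl;
    frag->rdma_req      = recvreq;
    frag->rdma_length   = size;
    frag->local_address = base;
    frag->local_handle  = reg_handle;                  /* deregistered by FRAG_RETURN */
    frag->cbfunc        = mca_pml_ob1_put_completion;  /* invoked when the FIN arrives */
    /* ... issue the get/put; on completion the fragment is recycled: */
    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);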

View file

@@ -13,7 +13,7 @@
  * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2006-2008 University of Houston. All rights reserved.
  * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
  * reserved.
  * $COPYRIGHT$
  *
@@ -108,13 +108,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
                                           mca_btl_base_descriptor_t* des,
                                           void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
     ompi_communicator_t *comm_ptr;
     mca_pml_ob1_recv_request_t *match = NULL;
     mca_pml_ob1_comm_t *comm;
     mca_pml_ob1_comm_proc_t *proc;
-    size_t num_segments = des->des_local_count;
+    size_t num_segments = des->des_segment_count;
     size_t bytes_received = 0;
     assert(num_segments <= MCA_BTL_DES_MAX_SEGMENTS);
@@ -256,7 +256,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@@ -264,7 +264,7 @@ void mca_pml_ob1_recv_frag_callback_rndv(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RNDV);
     mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
-                                des->des_local_count, MCA_PML_OB1_HDR_TYPE_RNDV);
+                                des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RNDV);
     return;
 }
@@ -273,7 +273,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
@@ -281,7 +281,7 @@ void mca_pml_ob1_recv_frag_callback_rget(mca_btl_base_module_t* btl,
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_RGET);
     mca_pml_ob1_recv_frag_match(btl, &hdr->hdr_match, segments,
-                                des->des_local_count, MCA_PML_OB1_HDR_TYPE_RGET);
+                                des->des_segment_count, MCA_PML_OB1_HDR_TYPE_RGET);
     return;
 }
@@ -292,9 +292,10 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata )
 {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     mca_pml_ob1_send_request_t* sendreq;
+    size_t size;
     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
         return;
@@ -307,19 +308,25 @@ void mca_pml_ob1_recv_frag_callback_ack(mca_btl_base_module_t* btl,
     /* if the request should be delivered entirely by copy in/out
      * then throttle sends */
     if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
-        if (NULL != sendreq->src_des) {
-            /* release registered memory */
-            mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
-            sendreq->src_des = NULL;
+        if (NULL != sendreq->rdma_frag) {
+            if (NULL != sendreq->rdma_frag->local_handle) {
+                mca_bml_base_deregister_mem (sendreq->req_rdma[0].bml_btl, sendreq->rdma_frag->local_handle);
+                sendreq->rdma_frag->local_handle = NULL;
+            }
+            MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
+            sendreq->rdma_frag = NULL;
         }
         sendreq->req_throttle_sends = true;
     }
-    mca_pml_ob1_send_request_copy_in_out(sendreq,
-                                         hdr->hdr_ack.hdr_send_offset,
-                                         sendreq->req_send.req_bytes_packed -
-                                         hdr->hdr_ack.hdr_send_offset);
+    if (hdr->hdr_ack.hdr_send_size) {
+        size = hdr->hdr_ack.hdr_send_size;
+    } else {
+        size = sendreq->req_send.req_bytes_packed - hdr->hdr_ack.hdr_send_offset;
+    }
+    mca_pml_ob1_send_request_copy_in_out(sendreq, hdr->hdr_ack.hdr_send_offset, size);
     if (sendreq->req_state != 0) {
         /* Typical receipt of an ACK message causes req_state to be
@@ -355,13 +362,14 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata ) {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     mca_pml_ob1_recv_request_t* recvreq;
     if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
         return;
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG);
     recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
@@ -372,7 +380,7 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
         assert(btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV);
         /* This will trigger the opal_convertor_pack to start asynchronous copy. */
-        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_local_count,des);
+        mca_pml_ob1_recv_request_frag_copy_start(recvreq,btl,segments,des->des_segment_count,des);
         /* Let BTL know that it CANNOT free the frag */
         des->des_flags |= MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC;
@@ -380,7 +388,8 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl,
         return;
     }
 #endif /* OPAL_CUDA_SUPPORT */
-    mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_local_count);
+    mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_segment_count);
     return;
 }
@@ -390,7 +399,7 @@ void mca_pml_ob1_recv_frag_callback_put(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
-    mca_btl_base_segment_t* segments = des->des_local;
+    mca_btl_base_segment_t* segments = des->des_segments;
     mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
     mca_pml_ob1_send_request_t* sendreq;
@@ -410,20 +419,17 @@ void mca_pml_ob1_recv_frag_callback_fin(mca_btl_base_module_t* btl,
                                         mca_btl_base_tag_t tag,
                                         mca_btl_base_descriptor_t* des,
                                         void* cbdata ) {
-    mca_btl_base_segment_t* segments = des->des_local;
-    mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval;
-    mca_btl_base_descriptor_t* rdma;
-    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
+    mca_btl_base_segment_t* segments = des->des_segments;
+    mca_pml_ob1_fin_hdr_t* hdr = (mca_pml_ob1_fin_hdr_t *) segments->seg_addr.pval;
+    mca_pml_ob1_rdma_frag_t *frag;
+    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_fin_hdr_t)) ) {
         return;
     }
     ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FIN);
-    rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
-    rdma->des_cbfunc(btl, NULL, rdma,
-                     hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
-    return;
+    frag = (mca_pml_ob1_rdma_frag_t *) hdr->hdr_frag.pval;
+    frag->cbfunc (frag, hdr->hdr_size);
 }
@@ -699,7 +705,7 @@ out_of_order_match:
     OPAL_THREAD_UNLOCK(&comm->matching_lock);
     if(OPAL_LIKELY(match)) {
         switch(type) {
         case MCA_PML_OB1_HDR_TYPE_MATCH:
             mca_pml_ob1_recv_request_progress_match(match, btl, segments, num_segments);
             break;

View file

@@ -13,7 +13,7 @@
  * Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
  * Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
  * reserved.
  * Copyright (c) 2012 FUJITSU LIMITED. All rights reserved.
  * Copyright (c) 2014 Research Organization for Information Science
@@ -150,12 +150,17 @@ static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* reque
     request->req_recv.req_base.req_ompi.req_free = mca_pml_ob1_recv_request_free;
     request->req_recv.req_base.req_ompi.req_cancel = mca_pml_ob1_recv_request_cancel;
     request->req_rdma_cnt = 0;
+    request->local_handle = NULL;
     OBJ_CONSTRUCT(&request->lock, opal_mutex_t);
 }
 static void mca_pml_ob1_recv_request_destruct(mca_pml_ob1_recv_request_t* request)
 {
     OBJ_DESTRUCT(&request->lock);
+    if (OPAL_UNLIKELY(request->local_handle)) {
+        mca_bml_base_deregister_mem (request->rdma_bml, request->local_handle);
+        request->local_handle = NULL;
+    }
 }
 OBJ_CLASS_INSTANCE(
@@ -183,31 +188,27 @@ static void mca_pml_ob1_recv_ctl_completion( mca_btl_base_module_t* btl,
  * Put operation has completed remotely - update request status
  */
-static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
-                                        struct mca_btl_base_endpoint_t* ep,
-                                        struct mca_btl_base_descriptor_t* des,
-                                        int status )
+static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_size)
 {
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
-    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_cbdata;
-    size_t bytes_received = 0;
-    if( OPAL_LIKELY(status == OMPI_SUCCESS) ) {
-        bytes_received = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
-                                                             (void *) des->des_local,
-                                                             des->des_local_count, 0);
-    }
+    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
     OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
-    mca_bml_base_free(bml_btl, des);
-    /* check completion status */
-    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
-    if(recv_request_pml_complete_check(recvreq) == false &&
+    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+    if (OPAL_LIKELY(0 < rdma_size)) {
+        assert (rdma_size == frag->rdma_length);
+        /* check completion status */
+        OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
+        if (recv_request_pml_complete_check(recvreq) == false &&
            recvreq->req_rdma_offset < recvreq->req_send_offset) {
             /* schedule additional rdma operations */
             mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
+        }
     }
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
@@ -218,7 +219,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
 int mca_pml_ob1_recv_request_ack_send_btl(
     ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
     uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
-    bool nordma)
+    uint64_t size, bool nordma)
 {
     mca_btl_base_descriptor_t* des;
     mca_pml_ob1_ack_hdr_t* ack;
@@ -234,12 +235,9 @@ int mca_pml_ob1_recv_request_ack_send_btl(
     }
     /* fill out header */
-    ack = (mca_pml_ob1_ack_hdr_t*)des->des_local->seg_addr.pval;
-    ack->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;
-    ack->hdr_common.hdr_flags = nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0;
-    ack->hdr_src_req.lval = hdr_src_req;
-    ack->hdr_dst_req.pval = hdr_dst_req;
-    ack->hdr_send_offset = hdr_send_offset;
+    ack = (mca_pml_ob1_ack_hdr_t*)des->des_segments->seg_addr.pval;
+    mca_pml_ob1_ack_hdr_prepare (ack, nordma ? MCA_PML_OB1_HDR_FLAGS_NORDMA : 0,
+                                 hdr_src_req, hdr_dst_req, hdr_send_offset, size);
     ob1_hdr_hton(ack, MCA_PML_OB1_HDR_TYPE_ACK, proc);
@@ -313,63 +311,99 @@ static int mca_pml_ob1_recv_request_ack(
         if(recvreq->req_send_offset == hdr->hdr_msg_length)
             return OMPI_SUCCESS;
     }
     /* let know to shedule function there is no need to put ACK flag */
     recvreq->req_ack_sent = true;
     return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval,
-                                             recvreq, recvreq->req_send_offset,
+                                             recvreq, recvreq->req_send_offset, 0,
                                              recvreq->req_send_offset == bytes_received);
 }

+static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag);
+
+static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
+{
+    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
+    ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
+
+    if (OMPI_ERR_NOT_AVAILABLE == rc) {
+        /* get isn't supported for this transfer. tell peer to fallback on put */
+        rc = mca_pml_ob1_recv_request_put_frag (frag);
+        if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
+            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+            opal_list_append (&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+
+            return OMPI_SUCCESS;
+        }
+    }
+
+    if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
+        OMPI_ERR_OUT_OF_RESOURCE == rc) {
+        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
+        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+
+        return OMPI_SUCCESS;
+    }
+
+    /* tell peer to fall back on send for this region */
+    rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
+                                           recvreq, frag->rdma_offset, frag->rdma_length, false);
+
+    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+
+    return rc;
+}
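The function above encodes a three-step fallback ladder. The simplified decision function below restates it with toy error codes; it compresses the two queueing branches into one and is only a sketch of the control flow, not the real routine.

    /* Sketch only -- toy restatement of the get-failure fallback ladder. */
    #include <stdio.h>

    enum { OK, ERR_NOT_AVAILABLE, ERR_OUT_OF_RESOURCE };

    static int toy_get_failed (int rc, int retries, int retry_limit)
    {
        if (ERR_NOT_AVAILABLE == rc) {
            return 1;   /* 1: re-issue this fragment as a put */
        }
        if (ERR_OUT_OF_RESOURCE == rc && retries + 1 < retry_limit) {
            return 2;   /* 2: append to rdma_pending and retry later */
        }
        return 3;       /* 3: ACK (offset, length) -> send/recv fallback */
    }

    int main (void)
    {
        printf ("%d %d %d\n",
                toy_get_failed (ERR_NOT_AVAILABLE, 0, 5),    /* 1: put fallback */
                toy_get_failed (ERR_OUT_OF_RESOURCE, 0, 5),  /* 2: queued retry */
                toy_get_failed (ERR_OUT_OF_RESOURCE, 4, 5)); /* 3: send fallback */
        return 0;
    }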

 /**
  * Return resources used by the RDMA
  */
-static void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
-                                         struct mca_btl_base_endpoint_t* ep,
-                                         struct mca_btl_base_descriptor_t* des,
-                                         int status )
+static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
+                                         void *local_address, mca_btl_base_registration_handle_t *local_handle,
+                                         void *context, void *cbdata, int status)
 {
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
-    mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
-    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;
+    mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
+    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;

     /* check completion status */
-    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
-        /* TSW - FIX */
-        OMPI_ERROR_LOG(status);
-        ompi_rte_abort(-1, NULL);
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
+        status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
+            /* TSW - FIX */
+            OMPI_ERROR_LOG(status);
+            ompi_rte_abort(-1, NULL);
+        }
+    } else {
+        /* is receive request complete */
+        OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
+        /* TODO: re-add order */
+        mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
+                              bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
+                              frag->rdma_length, 0, 0);
+
+        recv_request_pml_complete_check(recvreq);
+
+        MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
     }

-    /* is receive request complete */
-    OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
-    if (recvreq->req_recv.req_bytes_packed <= recvreq->req_bytes_received) {
-        mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
-                             bml_btl,
-                             frag->rdma_hdr.hdr_rget.hdr_des,
-                             des->order, 0);
-    }
-
-    recv_request_pml_complete_check(recvreq);
-
-    MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
-
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }

-static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
-                                          mca_btl_base_descriptor_t *dst) {
+static int mca_pml_ob1_recv_request_put_frag (mca_pml_ob1_rdma_frag_t *frag)
+{
     mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
     mca_btl_base_descriptor_t *ctl;
     mca_pml_ob1_rdma_hdr_t *hdr;
-    size_t seg_size;
+    size_t reg_size;
     int rc;

-    seg_size = bml_btl->btl->btl_seg_size * dst->des_local_count;
+    reg_size = bml_btl->btl->btl_registration_handle_size;

     /* prepare a descriptor for rdma control message */
-    mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + seg_size,
+    mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof (mca_pml_ob1_rdma_hdr_t) + reg_size,
                         MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
                         MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL);
     if (OPAL_UNLIKELY(NULL == ctl)) {
@@ -378,26 +412,19 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
     ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;

     /* fill in rdma header */
-    hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_local->seg_addr.pval;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
-    hdr->hdr_common.hdr_flags =
-        (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
-
-    hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
-    hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
-    hdr->hdr_des.pval = dst;
-    hdr->hdr_recv_req.pval = recvreq;
-    hdr->hdr_seg_cnt = dst->des_local_count;
-
-    /* copy segments */
-    memcpy (hdr + 1, dst->des_local, seg_size);
-
-    dst->des_cbfunc = mca_pml_ob1_put_completion;
-    dst->des_cbdata = recvreq;
-
-    if (!recvreq->req_ack_sent)
-        recvreq->req_ack_sent = true;
+    hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_segments->seg_addr.pval;
+    mca_pml_ob1_rdma_hdr_prepare (hdr, (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0,
+                                  recvreq->remote_req_send.lval, frag, recvreq, frag->rdma_offset,
+                                  frag->local_address, frag->rdma_length, frag->local_handle,
+                                  reg_size);
+    frag->cbfunc = mca_pml_ob1_put_completion;
+
+    recvreq->req_ack_sent = true;
+
+    PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
+                                  &(recvreq->req_recv.req_base), size,
+                                  PERUSE_RECV);

     /* send rdma request to peer */
     rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
@@ -412,71 +439,30 @@ static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
 /*
  *
  */
-int mca_pml_ob1_recv_request_get_frag( mca_pml_ob1_rdma_frag_t* frag )
+int mca_pml_ob1_recv_request_get_frag (mca_pml_ob1_rdma_frag_t *frag)
 {
-    mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)frag->rdma_req;
-    mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
-    mca_btl_base_descriptor_t* descriptor;
-    size_t save_size = frag->rdma_length;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
     int rc;

     /* prepare descriptor */
-    mca_bml_base_prepare_dst( bml_btl,
-                              NULL,
-                              &recvreq->req_recv.req_base.req_convertor,
-                              MCA_BTL_NO_ORDER,
-                              0,
-                              &frag->rdma_length,
-                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
-                              MCA_BTL_DES_FLAGS_GET,
-                              &descriptor );
-    if( OPAL_UNLIKELY(NULL == descriptor) ) {
-        if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
-            frag->rdma_length = save_size;
-            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-            opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
-            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        } else {
-            ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
-
-            /* tell peer to fall back on send */
-            recvreq->req_send_offset = 0;
-            rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
-                                                   recvreq, recvreq->req_send_offset, true);
-            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
-            return rc;
+    if (bml_btl->btl->btl_register_mem && !frag->local_handle) {
+        mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, MCA_BTL_REG_FLAG_LOCAL_WRITE |
+                                   MCA_BTL_REG_FLAG_REMOTE_WRITE, &frag->local_handle);
+        if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
+            return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
         }
     }

-    descriptor->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
-    descriptor->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
-    descriptor->des_cbfunc = mca_pml_ob1_rget_completion;
-    descriptor->des_cbdata = frag;
-
     PERUSE_TRACE_COMM_OMPI_EVENT(PERUSE_COMM_REQ_XFER_CONTINUE,
-                                 &(recvreq->req_recv.req_base),
+                                 &(((mca_pml_ob1_recv_request_t *) frag->rdma_req)->req_recv.req_base),
                                  frag->rdma_length, PERUSE_RECV);

     /* queue up get request */
-    rc = mca_bml_base_get(bml_btl,descriptor);
+    rc = mca_bml_base_get (bml_btl, frag->local_address, frag->remote_address, frag->local_handle,
+                           (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
+                           0, MCA_BTL_NO_ORDER, mca_pml_ob1_rget_completion, frag);
     if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
-        if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
-            /* get isn't supported for this transfer. tell peer to fallback on put */
-            rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
-        }
-
-        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
-            mca_bml_base_free(bml_btl, descriptor);
-            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-            opal_list_append(&mca_pml_ob1.rdma_pending,
-                             (opal_list_item_t*)frag);
-            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-            OMPI_ERROR_LOG(rc);
-            ompi_rte_abort(-1, NULL);
-        }
+        return mca_pml_ob1_recv_request_get_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
     }

     return OMPI_SUCCESS;
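The descriptor-based mca_bml_base_get of the old code is replaced by a single call that carries addresses, registration handles, size, and a completion callback. A toy declaration plus a stub that completes instantly, with invented toy_* names, captures the shape of the new interface.

    /* Sketch only -- the parameter shape of the BTL 3.0 style get, toy types. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct toy_reg_handle toy_reg_handle_t;   /* opaque */
    typedef void (*toy_rdma_cb_t) (void *ctx, void *cbdata, int status);

    static int toy_btl_get (void *local_address, uint64_t remote_address,
                            toy_reg_handle_t *local_handle, toy_reg_handle_t *remote_handle,
                            size_t size, int order, toy_rdma_cb_t cbfunc, void *cbdata)
    {
        (void) local_address; (void) remote_address; (void) local_handle;
        (void) remote_handle; (void) size; (void) order;
        cbfunc (NULL, cbdata, 0);   /* stub: pretend the transfer finished */
        return 0;
    }

    static void toy_done (void *ctx, void *cbdata, int status)
    {
        (void) ctx; (void) cbdata;
        printf ("get complete, status %d\n", status);
    }

    int main (void)
    {
        char buf[64];
        return toy_btl_get (buf, 0x1000, NULL, NULL, sizeof buf, 0, toy_done, NULL);
    }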
@@ -502,6 +488,7 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq
     bytes_received = mca_pml_ob1_compute_segment_length_base (segments, num_segments,
                                                               sizeof(mca_pml_ob1_frag_hdr_t));
     data_offset = hdr->hdr_frag.hdr_frag_offset;
+
     /*
      * Make user buffer accessible(defined) before unpacking.
      */
@@ -573,7 +560,7 @@ void mca_pml_ob1_recv_request_frag_copy_start( mca_pml_ob1_recv_request_t* recvr
     /* Store the receive request in unused context pointer. */
     des->des_context = (void *)recvreq;
     /* Store the amount of bytes in unused remote count value */
-    des->des_remote_count = bytes_delivered;
+    des->des_segment_count = bytes_delivered;
     /* Then record an event that will get triggered by a PML progress call which
      * checks the stream events. If we get an error, abort. Should get message
      * from CUDA code about what went wrong. */
@@ -598,7 +585,7 @@ void mca_pml_ob1_recv_request_frag_copy_finished( mca_btl_base_module_t* btl,
                                                   int status )
 {
     mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t*)des->des_context;
-    size_t bytes_received = des->des_remote_count;
+    size_t bytes_received = des->des_segment_count;

     OPAL_OUTPUT((-1, "frag_copy_finished (delivered=%d), frag=%p", (int)bytes_received, (void *)des));
     /* Call into the BTL so it can free the descriptor. At this point, it is
@@ -629,7 +616,6 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
     mca_pml_ob1_rget_hdr_t* hdr = (mca_pml_ob1_rget_hdr_t*)segments->seg_addr.pval;
     mca_bml_base_endpoint_t* bml_endpoint = NULL;
     size_t bytes_remaining, prev_sent, offset;
-    mca_btl_base_segment_t *r_segments;
     mca_pml_ob1_rdma_frag_t *frag;
     mca_bml_base_btl_t *rdma_bml;
     int rc;
@@ -637,6 +623,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
     prev_sent = offset = 0;
     bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
     recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
+    recvreq->req_send_offset = 0;

     MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
@@ -680,8 +667,10 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
         ompi_rte_abort(-1, NULL);
     }

-    bytes_remaining = mca_pml_ob1_compute_segment_length_remote (btl->btl_seg_size, (void *)(hdr + 1),
-                                                                 hdr->hdr_seg_cnt, recvreq->req_recv.req_base.req_proc);
+    bytes_remaining = hdr->hdr_rndv.hdr_msg_length;
+
+    /* save the request for put fallback */
+    recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;

     /* The while loop adds a fragmentation mechanism. The variable bytes_remaining holds the num
      * of bytes left to be send. In each iteration we send the max possible bytes supported
@@ -690,7 +679,12 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
      * the next iteration with the updated size.
      * Also - In each iteration we update the location in the buffer to be used for writing
      * the message ,and the location to read from. This is done using the offset variable that
-     * accumulates the number of bytes that were sent so far. */
+     * accumulates the number of bytes that were sent so far.
+     *
+     * NTH: This fragmentation may go away if we change the btls to require them to handle
+     * get fragmentation internally. This is a reasonable solution since some btls do not
+     * need any fragmentation (sm, vader, self, etc). Remove this loop if this ends up
+     * being the case. */
     while (bytes_remaining > 0) {
         /* allocate/initialize a fragment */
         MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
@@ -700,29 +694,31 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq
             ompi_rte_abort(-1, NULL);
         }

-        assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
-
-        memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
+        memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);

-        /* update the read location -- NTH: note this will only work if there is exactly one
-           segment. TODO -- make this work with multiple segments */
-        r_segments = (mca_btl_base_segment_t *) frag->rdma_segs;
-        r_segments->seg_addr.lval += offset;
+        /* update the read location */
+        frag->remote_address = hdr->hdr_src_ptr + offset;

         /* updating the write location */
         OPAL_THREAD_LOCK(&recvreq->lock);
         opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor, &offset);
+        opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &frag->local_address);
         OPAL_THREAD_UNLOCK(&recvreq->lock);

         frag->rdma_bml = rdma_bml;
         frag->rdma_hdr.hdr_rget = *hdr;
         frag->retries = 0;
         frag->rdma_req = recvreq;
-        frag->rdma_ep = bml_endpoint;
         frag->rdma_state = MCA_PML_OB1_RDMA_GET;
-        frag->reg = NULL;
-        frag->rdma_length = bytes_remaining;
+        frag->local_handle = NULL;
+        frag->rdma_offset = offset;
+
+        if (bytes_remaining > rdma_bml->btl->btl_get_limit) {
+            frag->rdma_length = rdma_bml->btl->btl_get_limit;
+        } else {
+            frag->rdma_length = bytes_remaining;
+        }

         /* NTH: TODO -- handle error conditions gracefully */
         rc = mca_pml_ob1_recv_request_get_frag(frag);
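The btl_get_limit clamp above is the whole fragmentation policy. A small self-contained program with made-up sizes walks the same loop and prints the (offset, length) pairs the PML would hand to successive get fragments.

    /* Sketch only -- the fragmentation policy of the loop above, toy values. */
    #include <stdio.h>

    int main (void)
    {
        unsigned long bytes_remaining = 10UL * 1024 * 1024;  /* message length */
        unsigned long get_limit = 4UL * 1024 * 1024;         /* btl_get_limit */
        unsigned long offset = 0;

        while (bytes_remaining > 0) {
            unsigned long len = bytes_remaining > get_limit ? get_limit : bytes_remaining;
            printf ("get fragment: offset %lu length %lu\n", offset, len);
            offset += len;           /* advances both read and write positions */
            bytes_remaining -= len;
        }
        return 0;
    }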
@@ -921,13 +917,11 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
     while(bytes_remaining > 0 &&
           recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
-        size_t size, seg_size;
-        mca_pml_ob1_rdma_hdr_t* hdr;
-        mca_btl_base_descriptor_t* dst;
-        mca_btl_base_descriptor_t* ctl;
-        mca_mpool_base_registration_t * reg = NULL;
-        mca_btl_base_module_t* btl;
+        mca_pml_ob1_rdma_frag_t *frag = NULL;
+        mca_btl_base_module_t *btl;
         int rc, rdma_idx;
+        void *data_ptr;
+        size_t size;

         if(prev_bytes_remaining == bytes_remaining) {
             if(++num_fail == num_tries) {
@@ -948,86 +942,62 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
         do {
             rdma_idx = recvreq->req_rdma_idx;
             bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
-            reg = recvreq->req_rdma[rdma_idx].btl_reg;
             size = recvreq->req_rdma[rdma_idx].length;
             if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
                 recvreq->req_rdma_idx = 0;
         } while(!size);
         btl = bml_btl->btl;

-        /* makes sure that we don't exceed BTL max rdma size
-         * if memory is not pinned already */
-        if( (NULL == reg) && (btl->btl_rdma_pipeline_frag_size != 0) &&
-            (size > btl->btl_rdma_pipeline_frag_size)) {
+        /* NTH: This conditional used to check if there was a registration in
+         * recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to
+         * the btl not needed registration (equivalent to btl->btl_register_mem
+         * != NULL. This new check is equivalent. Note: I feel this protocol
+         * needs work to better improve resource usage when running with a
+         * leave pinned protocol. */
+        if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
+            (size > btl->btl_rdma_pipeline_frag_size)) {
             size = btl->btl_rdma_pipeline_frag_size;
         }

-        /* take lock to protect converter against concurrent access
+        MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
+        if (OPAL_UNLIKELY(NULL == frag)) {
+            continue;
+        }
+
+        /* take lock to protect convertor against concurrent access
          * from unpack */
         OPAL_THREAD_LOCK(&recvreq->lock);
-        opal_convertor_set_position( &recvreq->req_recv.req_base.req_convertor,
-                                     &recvreq->req_rdma_offset );
-
-        /* prepare a descriptor for RDMA */
-        mca_bml_base_prepare_dst(bml_btl, reg,
-                                 &recvreq->req_recv.req_base.req_convertor,
-                                 MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
-                                 MCA_BTL_DES_FLAGS_PUT, &dst);
+        opal_convertor_set_position (&recvreq->req_recv.req_base.req_convertor,
+                                     &recvreq->req_rdma_offset);
+        opal_convertor_get_current_pointer (&recvreq->req_recv.req_base.req_convertor, &data_ptr);
         OPAL_THREAD_UNLOCK(&recvreq->lock);

-        if(OPAL_UNLIKELY(dst == NULL)) {
-            continue;
+        if (btl->btl_register_mem) {
+            mca_bml_base_register_mem (bml_btl, data_ptr, size, MCA_BTL_REG_FLAG_REMOTE_WRITE,
+                                       &frag->local_handle);
+            if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
+                MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+                continue;
+            }
         }

-        dst->des_cbfunc = mca_pml_ob1_put_completion;
-        dst->des_cbdata = recvreq;
+        /* fill in the minimum information needed to handle the fin message */
+        frag->cbfunc = mca_pml_ob1_put_completion;
+        frag->rdma_length = size;
+        frag->rdma_req = recvreq;
+        frag->rdma_bml = bml_btl;
+        frag->local_address = data_ptr;
+        frag->rdma_offset = recvreq->req_rdma_offset;

-        seg_size = btl->btl_seg_size * dst->des_local_count;
-
-        /* prepare a descriptor for rdma control message */
-        mca_bml_base_alloc(bml_btl, &ctl, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rdma_hdr_t) + seg_size,
-                           MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
-                           MCA_BTL_DES_SEND_ALWAYS_CALLBACK | MCA_BTL_DES_FLAGS_SIGNAL);
-        if( OPAL_UNLIKELY(NULL == ctl) ) {
-            mca_bml_base_free(bml_btl,dst);
-            continue;
-        }
-        ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
-
-        /* fill in rdma header */
-        hdr = (mca_pml_ob1_rdma_hdr_t*)ctl->des_local->seg_addr.pval;
-        hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
-        hdr->hdr_common.hdr_flags =
-            (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
-
-        hdr->hdr_req = recvreq->remote_req_send;
-        hdr->hdr_des.pval = dst;
-        hdr->hdr_recv_req.pval = recvreq;
-        hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
-        hdr->hdr_seg_cnt = dst->des_local_count;
-
-        /* copy segments */
-        memmove (hdr + 1, dst->des_local, seg_size);
-
-        if(!recvreq->req_ack_sent)
-            recvreq->req_ack_sent = true;
-        ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_PUT, recvreq->req_recv.req_base.req_proc);
-
-        PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
-                                      &(recvreq->req_recv.req_base), size,
-                                      PERUSE_RECV);
-
-        /* send rdma request to peer */
-        rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
-        if( OPAL_LIKELY( rc >= 0 ) ) {
+        rc = mca_pml_ob1_recv_request_put_frag (frag);
+        if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
             /* update request state */
             recvreq->req_rdma_offset += size;
             OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
             recvreq->req_rdma[rdma_idx].length -= size;
             bytes_remaining -= size;
         } else {
-            mca_bml_base_free(bml_btl,ctl);
-            mca_bml_base_free(bml_btl,dst);
+            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
         }
     }
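Scheduling a put fragment now means the PML itself registers the destination region when, and only when, the BTL publishes btl_register_mem. A sketch with hypothetical toy_* names isolates that rule; a NULL handle is not fatal, the fragment is simply returned and the BTL retried later, exactly as the continue above does.

    /* Sketch only -- per-fragment registration in the put pipeline, toy API. */
    #include <stdio.h>

    typedef struct toy_handle toy_handle_t;   /* opaque registration handle */

    static toy_handle_t *toy_register_mem (void *addr, unsigned long size, int flags)
    {
        (void) size; (void) flags;
        return (toy_handle_t *) addr;         /* stand-in: any non-NULL token */
    }

    static int toy_schedule_fragment (void *data_ptr, unsigned long size,
                                      int btl_wants_registration, toy_handle_t **handle)
    {
        *handle = NULL;
        if (btl_wants_registration) {
            *handle = toy_register_mem (data_ptr, size, /* REMOTE_WRITE */ 0x2);
            if (NULL == *handle) {
                return -1;   /* caller returns the frag and retries later */
            }
        }
        return 0;            /* go on to send the PUT control message */
    }

    int main (void)
    {
        char region[4096];
        toy_handle_t *handle;
        printf ("%d\n", toy_schedule_fragment (region, sizeof region, 1, &handle));
        return 0;
    }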


@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -10,7 +11,7 @@
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
  * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2014      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
@@ -131,7 +132,7 @@ do { \
 #define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq)                \
 {                                                               \
     MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv);       \
     OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_recv_requests,      \
                               (ompi_free_list_item_t*)(recvreq)); \
 }
@@ -154,9 +155,11 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
     }

     for(i = 0; i < recvreq->req_rdma_cnt; i++) {
-        mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
-        if( NULL != btl_reg && btl_reg->mpool != NULL) {
-            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
+        struct mca_btl_base_registration_handle_t *handle = recvreq->req_rdma[i].btl_reg;
+        mca_bml_base_btl_t *bml_btl = recvreq->req_rdma[i].bml_btl;
+
+        if (NULL != handle) {
+            mca_bml_base_deregister_mem (bml_btl, handle);
         }
     }
     recvreq->req_rdma_cnt = 0;
@@ -178,6 +181,10 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
             recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
                 MPI_ERR_TRUNCATE;
         }
+
+        if (OPAL_UNLIKELY(recvreq->local_handle)) {
+            mca_bml_base_deregister_mem (recvreq->rdma_bml, recvreq->local_handle);
+            recvreq->local_handle = NULL;
+        }
         MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
     }
     OPAL_THREAD_UNLOCK(&ompi_request_lock);
@@ -387,7 +394,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
     (void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
 }

-#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O)              \
+#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O, Sz)          \
     do {                                                        \
         mca_pml_ob1_pckt_pending_t *_pckt;                      \
                                                                 \
@@ -396,6 +403,7 @@ static inline void mca_pml_ob1_recv_request_schedule(
         _pckt->hdr.hdr_ack.hdr_src_req.lval = (S);              \
         _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D);              \
         _pckt->hdr.hdr_ack.hdr_send_offset = (O);               \
+        _pckt->hdr.hdr_ack.hdr_send_size = (Sz);                \
         _pckt->proc = (P);                                      \
         _pckt->bml_btl = NULL;                                  \
         OPAL_THREAD_LOCK(&mca_pml_ob1.lock);                    \
@@ -406,11 +414,11 @@ static inline void mca_pml_ob1_recv_request_schedule(
 int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
         mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
-        uint64_t hdr_rdma_offset, bool nordma);
+        uint64_t hdr_rdma_offset, uint64_t size, bool nordma);

 static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
         uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset,
-        bool nordma)
+        uint64_t size, bool nordma)
 {
     size_t i;
     mca_bml_base_btl_t* bml_btl;
@@ -420,12 +428,12 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc,
     for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
         bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
         if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
-                    hdr_dst_req, hdr_send_offset, nordma) == OMPI_SUCCESS)
+                    hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS)
             return OMPI_SUCCESS;
     }

     MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
-                                   hdr_send_offset);
+                                   hdr_send_offset, size);

     return OMPI_ERR_OUT_OF_RESOURCE;
 }


@@ -13,7 +13,7 @@
  * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
  * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012      NVIDIA Corporation. All rights reserved.
- * Copyright (c) 2012      Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -137,6 +137,7 @@ static void mca_pml_ob1_send_request_construct(mca_pml_ob1_send_request_t* req)
     req->req_send.req_base.req_ompi.req_cancel = mca_pml_ob1_send_request_cancel;
     req->req_rdma_cnt = 0;
     req->req_throttle_sends = false;
+    req->rdma_frag = NULL;
     OBJ_CONSTRUCT(&req->req_send_ranges, opal_list_t);
     OBJ_CONSTRUCT(&req->req_send_range_lock, opal_mutex_t);
 }
@@ -145,6 +146,10 @@ static void mca_pml_ob1_send_request_destruct(mca_pml_ob1_send_request_t* req)
 {
     OBJ_DESTRUCT(&req->req_send_ranges);
     OBJ_DESTRUCT(&req->req_send_range_lock);
+    if (req->rdma_frag) {
+        MCA_PML_OB1_RDMA_FRAG_RETURN(req->rdma_frag);
+        req->rdma_frag = NULL;
+    }
 }

 OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
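The construct/destruct pair pins down ownership of the cached fallback fragment. The toy pair below mirrors the rule with plain malloc/free standing in for the fragment free list; the names are invented.

    /* Sketch only -- owned-resource rule of the construct/destruct pair. */
    #include <stdlib.h>

    typedef struct toy_req {
        void *rdma_frag;   /* NULL unless a get->put fallback fragment is parked here */
    } toy_req_t;

    static void toy_req_construct (toy_req_t *req)
    {
        req->rdma_frag = NULL;
    }

    static void toy_req_destruct (toy_req_t *req)
    {
        if (req->rdma_frag) {
            free (req->rdma_frag);   /* real code: MCA_PML_OB1_RDMA_FRAG_RETURN */
            req->rdma_frag = NULL;
        }
    }

    int main (void)
    {
        toy_req_t req;
        toy_req_construct (&req);
        req.rdma_frag = malloc (64);  /* a fallback fragment was cached... */
        toy_req_destruct (&req);      /* ...and must be returned exactly once */
        return 0;
    }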
@@ -236,10 +241,9 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
      * happens in one thread, the increase of the req_bytes_delivered does not
      * have to be atomic.
      */
-    req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
-                                                              (void *) des->des_local,
-                                                              des->des_local_count,
-                                                              sizeof(mca_pml_ob1_rendezvous_hdr_t));
+    req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
+                                                                   des->des_segment_count,
+                                                                   sizeof(mca_pml_ob1_rendezvous_hdr_t));

     mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
 }
@@ -250,27 +254,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl,
  */

 static void
-mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl,
-                             struct mca_btl_base_endpoint_t* ep,
-                             struct mca_btl_base_descriptor_t* des,
-                             int status )
+mca_pml_ob1_rget_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t rdma_length)
 {
-    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata;
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
-    size_t req_bytes_delivered;
+    mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;

     /* count bytes of user data actually delivered and check for request completion */
-    if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
-        req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
-                                                                  (void *) des->des_local,
-                                                                  des->des_local_count, 0);
-        OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+    if (OPAL_LIKELY(0 < rdma_length)) {
+        OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, (size_t) rdma_length);
     }

-    sendreq->src_des = NULL;
-
     send_request_pml_complete_check(sendreq);
-    /* free the descriptor */
-    mca_bml_base_free(bml_btl, des);
+
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
@@ -314,10 +309,9 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
     }

     /* count bytes of user data actually delivered */
-    req_bytes_delivered = mca_pml_ob1_compute_segment_length (btl->btl_seg_size,
-                                                              (void *) des->des_local,
-                                                              des->des_local_count,
-                                                              sizeof(mca_pml_ob1_frag_hdr_t));
+    req_bytes_delivered = mca_pml_ob1_compute_segment_length_base ((void *) des->des_segments,
+                                                                   des->des_segment_count,
+                                                                   sizeof(mca_pml_ob1_frag_hdr_t));

     OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
     OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
@@ -389,7 +383,7 @@ int mca_pml_ob1_send_request_start_buffered(
     if( OPAL_UNLIKELY(NULL == des) ) {
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
-    segment = des->des_local;
+    segment = des->des_segments;

     /* pack the data into the BTL supplied buffer */
     iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
@@ -408,17 +402,14 @@ int mca_pml_ob1_send_request_start_buffered(

     /* build rendezvous header */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
-    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
+    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, 0,
+                                        sendreq->req_send.req_base.req_comm->c_contextid,
+                                        sendreq->req_send.req_base.req_comm->c_my_rank,
+                                        sendreq->req_send.req_base.req_tag,
+                                        (uint16_t)sendreq->req_send.req_base.req_sequence,
+                                        sendreq->req_send.req_bytes_packed, sendreq);

-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
-                 sendreq->req_send.req_base.req_proc);
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);

     /* update lengths */
     segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data;
@@ -491,15 +482,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
     if(NULL != bml_btl->btl->btl_sendi) {
         mca_pml_ob1_match_hdr_t match;

-        match.hdr_common.hdr_flags = 0;
-        match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-        match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-        match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-        match.hdr_tag = sendreq->req_send.req_base.req_tag;
-        match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+        mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                       sendreq->req_send.req_base.req_comm->c_contextid,
+                                       sendreq->req_send.req_base.req_comm->c_my_rank,
+                                       sendreq->req_send.req_base.req_tag,
+                                       (uint16_t)sendreq->req_send.req_base.req_sequence);

-        ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH,
-                     sendreq->req_send.req_base.req_proc);
+        ob1_hdr_hton (&match, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

         /* try to send immediately */
         rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor,
@@ -532,7 +521,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
         return OMPI_ERR_OUT_OF_RESOURCE;
     }

-    segment = des->des_local;
+    segment = des->des_segments;

     if(size > 0) {
         /* pack the data into the supplied buffer */
@@ -566,15 +555,13 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,

     /* build match header */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                   sendreq->req_send.req_base.req_comm->c_contextid,
+                                   sendreq->req_send.req_base.req_comm->c_my_rank,
+                                   sendreq->req_send.req_base.req_tag,
+                                   (uint16_t)sendreq->req_send.req_base.req_sequence);

-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
-                 sendreq->req_send.req_base.req_proc);
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

     /* update lengths */
     segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data;
@@ -618,7 +605,6 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
     /* prepare descriptor */
     mca_bml_base_prepare_src( bml_btl,
-                              NULL,
                               &sendreq->req_send.req_base.req_convertor,
                               MCA_BTL_NO_ORDER,
                               OMPI_PML_OB1_MATCH_HDR_LEN,
@@ -628,19 +614,17 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq,
     if( OPAL_UNLIKELY(NULL == des) ) {
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
-    segment = des->des_local;
+    segment = des->des_segments;

     /* build match header */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = 0;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
+    mca_pml_ob1_match_hdr_prepare (&hdr->hdr_match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
+                                   sendreq->req_send.req_base.req_comm->c_contextid,
+                                   sendreq->req_send.req_base.req_comm->c_my_rank,
+                                   sendreq->req_send.req_base.req_tag,
+                                   (uint16_t)sendreq->req_send.req_base.req_sequence);

-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH,
-                 sendreq->req_send.req_base.req_proc);
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, sendreq->req_send.req_base.req_proc);

     /* short message */
     des->des_cbfunc = mca_pml_ob1_match_completion_free;
@@ -674,80 +658,68 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
      * one RDMA capable BTLs). This way round robin distribution of RDMA
      * operation is achieved.
      */
-    mca_btl_base_descriptor_t *des, *src = NULL;
+    mca_btl_base_registration_handle_t *local_handle;
+    mca_btl_base_descriptor_t *des;
+    mca_pml_ob1_rdma_frag_t *frag;
     mca_pml_ob1_rget_hdr_t *hdr;
-    size_t seg_size;
+    size_t reg_size;
+    void *data_ptr;
     int rc;

-    sendreq->src_des = NULL;
-
     bml_btl = sendreq->req_rdma[0].bml_btl;
     if (!(bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
+        sendreq->rdma_frag = NULL;
         /* This BTL does not support get. Use rendezvous to start the RDMA operation using put instead. */
         return mca_pml_ob1_send_request_start_rndv (sendreq, bml_btl, 0, MCA_PML_OB1_HDR_FLAGS_CONTIG |
                                                     MCA_PML_OB1_HDR_FLAGS_PIN);
     }

-    MEMCHECKER(
-        memchecker_call(&opal_memchecker_base_mem_defined,
-                        sendreq->req_send.req_base.req_addr,
-                        sendreq->req_send.req_base.req_count,
-                        sendreq->req_send.req_base.req_datatype);
-    );
+    /* at this time ob1 does not support non-contiguous gets. the convertor represents a
+     * contiguous block of memory */
+    opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr);

-    /* prepare source descriptor/segment(s) */
-    /* PML owns this descriptor and will free it in */
-    /*  mca_pml_ob1_rget_completion */
-    mca_bml_base_prepare_src( bml_btl, sendreq->req_rdma[0].btl_reg,
-                              &sendreq->req_send.req_base.req_convertor,
-                              MCA_BTL_NO_ORDER, 0, &size, MCA_BTL_DES_FLAGS_GET |
-                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &src );
-    MEMCHECKER(
-        memchecker_call(&opal_memchecker_base_mem_noaccess,
-                        sendreq->req_send.req_base.req_addr,
-                        sendreq->req_send.req_base.req_count,
-                        sendreq->req_send.req_base.req_datatype);
-    );
-    if( OPAL_UNLIKELY(NULL == src) ) {
-        return OMPI_ERR_OUT_OF_RESOURCE;
+    local_handle = sendreq->req_rdma[0].btl_reg;
+
+    /* allocate an rdma fragment to keep track of the request size for use in the fin message */
+    MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
+    if (OPAL_UNLIKELY(NULL == frag)) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
     }
-    src->des_cbfunc = mca_pml_ob1_rget_completion;
-    src->des_cbdata = sendreq;

-    sendreq->src_des = src;
+    /* fill in necessary fragment data */
+    frag->rdma_req = sendreq;
+    frag->rdma_bml = bml_btl;
+    frag->rdma_length = size;
+    frag->cbfunc = mca_pml_ob1_rget_completion;
+    /* do not store the local handle in the fragment. it will be released by mca_pml_ob1_free_rdma_resources */

-    seg_size = bml_btl->btl->btl_seg_size * src->des_local_count;
+    /* save the fragment for get->put fallback */
+    sendreq->rdma_frag = frag;
+
+    reg_size = bml_btl->btl->btl_registration_handle_size;

     /* allocate space for get hdr + segment list */
-    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + seg_size,
+    mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof (*hdr) + reg_size,
                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
                        MCA_BTL_DES_FLAGS_SIGNAL);
     if( OPAL_UNLIKELY(NULL == des) ) {
         /* NTH: no need to reset the converter here. it will be reset before it is retried */
-        mca_bml_base_free(bml_btl, src);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }

     /* build match header */
-    hdr = (mca_pml_ob1_rget_hdr_t *) des->des_local->seg_addr.pval;
-
-    /* TODO -- Add support for multiple segments for get */
-    hdr->hdr_rndv.hdr_match.hdr_common.hdr_flags = MCA_PML_OB1_HDR_FLAGS_CONTIG|MCA_PML_OB1_HDR_FLAGS_PIN;
-    hdr->hdr_rndv.hdr_match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RGET;
-    hdr->hdr_rndv.hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_rndv.hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_rndv.hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_rndv.hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
-    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
-    hdr->hdr_des.pval = src;
-    hdr->hdr_seg_cnt = src->des_local_count;
+    hdr = (mca_pml_ob1_rget_hdr_t *) des->des_segments->seg_addr.pval;
+    mca_pml_ob1_rget_hdr_prepare (hdr, MCA_PML_OB1_HDR_FLAGS_CONTIG | MCA_PML_OB1_HDR_FLAGS_PIN,
+                                  sendreq->req_send.req_base.req_comm->c_contextid,
+                                  sendreq->req_send.req_base.req_comm->c_my_rank,
+                                  sendreq->req_send.req_base.req_tag,
+                                  (uint16_t)sendreq->req_send.req_base.req_sequence,
+                                  sendreq->req_send.req_bytes_packed, sendreq,
+                                  frag, data_ptr, local_handle, reg_size);

     ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RGET, sendreq->req_send.req_base.req_proc);

-    /* copy segment data */
-    memcpy (hdr + 1, src->des_local, seg_size);
-
     des->des_cbfunc = mca_pml_ob1_send_ctl_completion;
     des->des_cbdata = sendreq;
@@ -765,12 +737,6 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq,
     rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RGET);
     if (OPAL_UNLIKELY(rc < 0)) {
         mca_bml_base_free(bml_btl, des);
-
-        if (sendreq->src_des) {
-            mca_bml_base_free (bml_btl, sendreq->src_des);
-            sendreq->src_des = NULL;
-        }
-
         return rc;
     }
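On the wire, the RGET control message is now a fixed header followed by btl_registration_handle_size opaque bytes, rather than a variable list of BTL segments. The packing sketch below uses a hypothetical two-field header; the real mca_pml_ob1_rget_hdr_t carries the full match/rendezvous prefix as well.

    /* Sketch only -- header-plus-opaque-handle layout, toy types and sizes. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct toy_rget_hdr {
        uint64_t frag;      /* sender-side fragment token, echoed back in the FIN */
        uint64_t src_ptr;   /* source buffer address on the sender */
    } toy_rget_hdr_t;

    static size_t toy_pack_rget (unsigned char *wire, const toy_rget_hdr_t *hdr,
                                 const void *reg_handle, size_t reg_size)
    {
        memcpy (wire, hdr, sizeof (*hdr));
        memcpy (wire + sizeof (*hdr), reg_handle, reg_size);  /* opaque handle bytes */
        return sizeof (*hdr) + reg_size;
    }

    int main (void)
    {
        unsigned char wire[64], handle[16] = { 0 };
        toy_rget_hdr_t hdr = { .frag = 1, .src_ptr = 0x2000 };
        printf ("%zu bytes\n", toy_pack_rget (wire, &hdr, handle, sizeof handle));
        return 0;
    }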
@@ -808,7 +774,6 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
                         sendreq->req_send.req_base.req_datatype);
     );
     mca_bml_base_prepare_src( bml_btl,
-                              NULL,
                               &sendreq->req_send.req_base.req_convertor,
                               MCA_BTL_NO_ORDER,
                               sizeof(mca_pml_ob1_rendezvous_hdr_t),
@@ -827,21 +792,19 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq,
     if( OPAL_UNLIKELY(NULL == des) ) {
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
-    segment = des->des_local;
+    segment = des->des_segments;

     /* build hdr */
     hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval;
-    hdr->hdr_common.hdr_flags = flags | MCA_PML_OB1_HDR_FLAGS_SIGNAL;
-    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_RNDV;
-    hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
-    hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
-    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
-    hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
-    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
-    hdr->hdr_rndv.hdr_src_req.pval = sendreq;
+    mca_pml_ob1_rendezvous_hdr_prepare (&hdr->hdr_rndv, MCA_PML_OB1_HDR_TYPE_RNDV, flags |
+                                        MCA_PML_OB1_HDR_FLAGS_SIGNAL,
+                                        sendreq->req_send.req_base.req_comm->c_contextid,
+                                        sendreq->req_send.req_base.req_comm->c_my_rank,
+                                        sendreq->req_send.req_base.req_tag,
+                                        (uint16_t)sendreq->req_send.req_base.req_sequence,
+                                        sendreq->req_send.req_bytes_packed, sendreq);

-    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV,
-                 sendreq->req_send.req_base.req_proc);
+    ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc);

     /* first fragment of a long message */
     des->des_cbdata = sendreq;
@@ -1022,13 +985,10 @@ cannot_pack:
                         sendreq->req_send.req_base.req_count,
                         sendreq->req_send.req_base.req_datatype);
         );
-        mca_bml_base_prepare_src(bml_btl, NULL,
-                                 &sendreq->req_send.req_base.req_convertor,
-                                 MCA_BTL_NO_ORDER,
-                                 sizeof(mca_pml_ob1_frag_hdr_t),
+        mca_bml_base_prepare_src(bml_btl, &sendreq->req_send.req_base.req_convertor,
+                                 MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t),
                                  &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
-                                 MCA_BTL_DES_FLAGS_SIGNAL,
-                                 &des);
+                                 MCA_BTL_DES_FLAGS_SIGNAL, &des);
         MEMCHECKER(
             memchecker_call(&opal_memchecker_base_mem_noaccess,
                             sendreq->req_send.req_base.req_addr,
@@ -1051,12 +1011,9 @@ cannot_pack:
         des->des_cbdata = sendreq;

         /* setup header */
-        hdr = (mca_pml_ob1_frag_hdr_t*)des->des_local->seg_addr.pval;
-        hdr->hdr_common.hdr_flags = 0;
-        hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FRAG;
-        hdr->hdr_frag_offset = range->range_send_offset;
-        hdr->hdr_src_req.pval = sendreq;
-        hdr->hdr_dst_req = sendreq->req_recv;
+        hdr = (mca_pml_ob1_frag_hdr_t*)des->des_segments->seg_addr.pval;
+        mca_pml_ob1_frag_hdr_prepare (hdr, 0, range->range_send_offset, sendreq,
+                                      sendreq->req_recv.lval);

         ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_FRAG,
                      sendreq->req_send.req_base.req_proc);
@@ -1113,38 +1070,66 @@ cannot_pack:
     }

+/**
+ * A put fragment could not be started. Queue the fragment to be retried later or
+ * fall back on send/recv.
+ */
+static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *frag, int rc)
+{
+    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
+
+    if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
+        /* queue the frag for later if there was a resource error */
+        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
+        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+    } else {
+        /* tell receiver to deregister memory */
+        mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
+                              frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
+                              OPAL_ERR_TEMP_OUT_OF_RESOURCE);
+
+        /* send fragment by copy in/out */
+        mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
+                                             frag->rdma_length);
+
+        /* if a pointer to a receive request is not set it means that
+         * ACK was not yet received. Don't schedule sends before ACK */
+        if (NULL != sendreq->req_recv.pval)
+            mca_pml_ob1_send_request_schedule (sendreq);
+    }
+}
+
 /**
  * An RDMA put operation has completed:
  * (1) Update request status and if required set completed
  * (2) Send FIN control message to the destination
  */
-static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
-                                        struct mca_btl_base_endpoint_t* ep,
-                                        struct mca_btl_base_descriptor_t* des,
-                                        int status )
+static void mca_pml_ob1_put_completion (mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep,
+                                        void *local_address, mca_btl_base_registration_handle_t *local_handle,
+                                        void *context, void *cbdata, int status)
 {
-    mca_pml_ob1_rdma_frag_t* frag = (mca_pml_ob1_rdma_frag_t*)des->des_cbdata;
-    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
-    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
+    mca_pml_ob1_rdma_frag_t *frag = (mca_pml_ob1_rdma_frag_t *) cbdata;
+    mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = (mca_bml_base_btl_t *) context;

     /* check completion status */
-    if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
-        /* TSW - FIX */
-        OMPI_ERROR_LOG(status);
-        ompi_rte_abort(-1, NULL);
-    }
+    if( OPAL_UNLIKELY(OMPI_SUCCESS == status) ) {
+        /* TODO -- readd ordering */
+        mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
+                              frag->rdma_hdr.hdr_rdma.hdr_frag, frag->rdma_length,
+                              0, 0);

-    mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
-                         bml_btl,
-                         frag->rdma_hdr.hdr_rdma.hdr_des,
-                         des->order, 0);
-
-    /* check for request completion */
-    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
-
-    send_request_pml_complete_check(sendreq);
+        /* check for request completion */
+        OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
+
+        send_request_pml_complete_check(sendreq);
+    } else {
+        /* try to fall back on send/recv */
+        mca_pml_ob1_send_request_put_frag_failed (frag, status);
+    }

     MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
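Both arms of the new completion end in a FIN, but with different payloads. The toy program below, with invented names and values, prints the two cases: a positive length on success, and zero plus an error status when the receiver should drop its registration and fall back to send/recv.

    /* Sketch only -- the two FIN cases, toy function and values. */
    #include <stdint.h>
    #include <stdio.h>

    static void toy_send_fin (uint64_t frag_token, int64_t length, int status)
    {
        printf ("FIN frag=%llu length=%lld status=%d\n",
                (unsigned long long) frag_token, (long long) length, status);
    }

    int main (void)
    {
        toy_send_fin (0x1, 1 << 20, 0);   /* put completed: report bytes moved */
        toy_send_fin (0x2, 0, -2);        /* put failed: receiver deregisters */
        return 0;
    }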
@@ -1152,81 +1137,45 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
 int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
 {
-    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
-    mca_mpool_base_registration_t *reg = NULL;
+    mca_pml_ob1_send_request_t *sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
+    mca_btl_base_registration_handle_t *local_handle = NULL;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
-    mca_btl_base_descriptor_t *des;
-    size_t save_size = frag->rdma_length;
     int rc;

-    if (OPAL_LIKELY(NULL == sendreq->src_des)) {
-        /* setup descriptor */
-        mca_bml_base_prepare_src( bml_btl,
-                                  reg,
-                                  &frag->convertor,
-                                  MCA_BTL_NO_ORDER,
-                                  0,
-                                  &frag->rdma_length,
-                                  MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
-                                  MCA_BTL_DES_FLAGS_PUT,
-                                  &des );
-        if( OPAL_UNLIKELY(NULL == des) ) {
-            if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
-                size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
-                frag->rdma_length = save_size;
-                opal_convertor_set_position(&frag->convertor, &offset);
-                OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-                opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
-                OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            } else {
-                mca_pml_ob1_send_request_t *sendreq =
-                    (mca_pml_ob1_send_request_t*)frag->rdma_req;
-
-                /* tell receiver to unregister memory */
-                mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
-                                     bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
-                                     MCA_BTL_NO_ORDER, 1);
-
-                /* send fragment by copy in/out */
-                mca_pml_ob1_send_request_copy_in_out(sendreq,
-                                                     frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
-                /* if a pointer to a receive request is not set it means that
-                 * ACK was not yet received. Don't schedule sends before ACK */
-                if(NULL != sendreq->req_recv.pval)
-                    mca_pml_ob1_send_request_schedule(sendreq);
+    if (bml_btl->btl->btl_register_mem && NULL == frag->local_handle) {
+        /* Check if the segment is already registered */
+        for (size_t i = 0 ; i < sendreq->req_rdma_cnt ; ++i) {
+            if (sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
+                /* do not copy the handle to the fragment to avoid deregistring it twice */
+                local_handle = sendreq->req_rdma[i].btl_reg;
+                break;
             }
-            return OMPI_ERR_OUT_OF_RESOURCE;
         }
-    } else {
-        /* already have a source descriptor */
-        des = sendreq->src_des;
-        sendreq->src_des = NULL;
-    }

-    des->des_remote = (mca_btl_base_segment_t *) frag->rdma_segs;
-    des->des_remote_count = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
-    des->des_cbfunc = mca_pml_ob1_put_completion;
-    des->des_cbdata = frag;
+        if (NULL == frag->local_handle) {
+            /* Not already registered. Register the region with the BTL. */
+            mca_bml_base_register_mem (bml_btl, frag->local_address, frag->rdma_length, 0,
+                                       &frag->local_handle);
+
+            if (OPAL_UNLIKELY(NULL == frag->local_handle)) {
+                mca_pml_ob1_send_request_put_frag_failed (frag, OMPI_ERR_OUT_OF_RESOURCE);
+
+                return OMPI_ERR_OUT_OF_RESOURCE;
+            }
+
+            local_handle = frag->local_handle;
+        }
+    }

     PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
                                   &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, PERUSE_SEND );

-    rc = mca_bml_base_put(bml_btl, des);
+    rc = mca_bml_base_put (bml_btl, frag->local_address, frag->remote_address, local_handle,
+                           (mca_btl_base_registration_handle_t *) frag->remote_handle, frag->rdma_length,
+                           0, MCA_BTL_NO_ORDER, mca_pml_ob1_put_completion, frag);
     if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        mca_bml_base_free(bml_btl, des);
-        frag->rdma_length = save_size;
-        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
-            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-            opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
-            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        } else {
-            /* TSW - FIX */
-            OMPI_ERROR_LOG(rc);
-            ompi_rte_abort(-1, NULL);
-        }
+        mca_pml_ob1_send_request_put_frag_failed (frag, rc);
+        return rc;
     }

     return OMPI_SUCCESS;
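Before registering, the sender first looks for a handle already cached on the request, and deliberately keeps a reused handle off the fragment so it is deregistered exactly once. The lookup below restates that rule with toy types.

    /* Sketch only -- lookup-before-register, toy types standing in for
     * the request's req_rdma array. */
    #include <stddef.h>
    #include <stdio.h>

    typedef struct toy_rdma { void *bml_btl; void *btl_reg; } toy_rdma_t;

    static void *toy_find_handle (const toy_rdma_t *rdma, size_t cnt, const void *bml_btl)
    {
        for (size_t i = 0; i < cnt; ++i) {
            if (rdma[i].bml_btl == bml_btl) {
                return rdma[i].btl_reg;   /* reuse; never stored on the fragment */
            }
        }
        return NULL;                      /* caller registers and owns the result */
    }

    int main (void)
    {
        int btl_a, btl_b, reg_a;
        toy_rdma_t cached[1] = { { &btl_a, &reg_a } };
        printf ("%p %p\n", toy_find_handle (cached, 1, &btl_a),
                           toy_find_handle (cached, 1, &btl_b));
        return 0;
    }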
@ -1240,12 +1189,11 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
  */
 void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
                                    mca_btl_base_module_t* btl,
                                    mca_pml_ob1_rdma_hdr_t* hdr )
 {
     mca_bml_base_endpoint_t *bml_endpoint = sendreq->req_endpoint;
     mca_pml_ob1_rdma_frag_t* frag;
-    size_t i, size = 0;
 
     if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_TYPE_ACK) {
         OPAL_THREAD_ADD32(&sendreq->req_state, -1);
@@ -1253,61 +1201,36 @@ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq,
     sendreq->req_recv.pval = hdr->hdr_recv_req.pval;
 
-    MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
-    if( OPAL_UNLIKELY(NULL == frag) ) {
-        /* TSW - FIX */
-        OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
-        ompi_rte_abort(-1, NULL);
-    }
-
-    assert (btl->btl_seg_size * hdr->hdr_seg_cnt <= sizeof (frag->rdma_segs));
-
-    /* setup fragment */
-    memcpy (frag->rdma_segs, hdr + 1, btl->btl_seg_size * hdr->hdr_seg_cnt);
-    for( i = 0; i < hdr->hdr_seg_cnt; i++ ) {
-        mca_btl_base_segment_t *seg = (mca_btl_base_segment_t *) ((uintptr_t)(frag->rdma_segs) + i * btl->btl_seg_size);
-#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
-        if ((sendreq->req_send.req_base.req_proc->super.proc_arch & OPAL_ARCH_ISBIGENDIAN) !=
-            (ompi_proc_local()->super.proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
-            size += opal_swap_bytes4(seg->seg_len);
-        } else
-#endif
-        {
-            size += seg->seg_len;
-        }
-    }
+    if (NULL == sendreq->rdma_frag) {
+        MCA_PML_OB1_RDMA_FRAG_ALLOC(frag);
+        if( OPAL_UNLIKELY(NULL == frag) ) {
+            /* TSW - FIX */
+            OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
+            ompi_rte_abort(-1, NULL);
+        }
+    } else {
+        /* rget fallback on put */
+        frag = sendreq->rdma_frag;
+        sendreq->rdma_frag = NULL;
+        sendreq->req_state = 0;
+    }
+
+    /* copy registration data */
+    memcpy (frag->remote_handle, hdr + 1, btl->btl_registration_handle_size);
 
     frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
     frag->rdma_hdr.hdr_rdma = *hdr;
     frag->rdma_req = sendreq;
-    frag->rdma_ep = bml_endpoint;
-    frag->rdma_length = size;
+    frag->rdma_length = hdr->hdr_dst_size;
     frag->rdma_state = MCA_PML_OB1_RDMA_PUT;
-    frag->reg = NULL;
+    frag->remote_address = hdr->hdr_dst_ptr;
     frag->retries = 0;
 
-    if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
-        /* get fallback path */
-        sendreq->req_state = 0;
-    }
-
-    /* lookup the corresponding registration */
-    for(i=0; i<sendreq->req_rdma_cnt; i++) {
-        if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
-            frag->reg = sendreq->req_rdma[i].btl_reg;
-            break;
-        }
-    }
-
-    /* RDMA writes may proceed in parallel to send and to each other, so
-     * create clone of the convertor for each RDMA fragment
-     */
-    size = hdr->hdr_rdma_offset;
-    opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
-                                       &frag->convertor, 0, &size);
+    /* Get the address of the current offset. Note: at this time ob1 CAN NOT handle
+     * non-contiguous RDMA. If that changes this code will be wrong. */
+    opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
+                                       hdr->hdr_rdma_offset, &frag->local_address);
 
     mca_pml_ob1_send_request_put_frag(frag);
 }
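Editor's note: the old code cloned the convertor per fragment so RDMA writes could proceed in parallel; the new code instead assumes contiguous data and asks the convertor for a raw pointer at the RDMA offset. A minimal illustration of that assumption (variable names are placeholders, not part of the commit):

    /* For a contiguous send buffer, the address at an RDMA offset is a plain
     * pointer computation; this is the property the in-code comment above
     * depends on.  Sketch only: */
    void *local_address;
    opal_convertor_get_offset_pointer (&sendreq->req_send.req_base.req_convertor,
                                       hdr->hdr_rdma_offset, &local_address);
    /* equivalent, for contiguous data, to:
     *   local_address = (char *) buffer_base + hdr->hdr_rdma_offset;  */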

View file

@@ -12,7 +12,7 @@
  *                         All rights reserved.
  * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2011-2012 NVIDIA Corporation.  All rights reserved.
- * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
+ * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
  *
@@ -54,7 +54,7 @@ struct mca_pml_ob1_send_request_t {
     mca_pml_ob1_send_pending_t req_pending;
     opal_mutex_t req_send_range_lock;
     opal_list_t req_send_ranges;
-    mca_btl_base_descriptor_t *src_des;
+    mca_pml_ob1_rdma_frag_t *rdma_frag;
     mca_pml_ob1_com_btl_t req_rdma[1];
 };
 typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
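Editor's note: the replaced field is the hook for the rget-fallback path seen in the .c hunks above: when an RGET is downgraded to the PUT protocol, the already-allocated fragment is parked on the send request and reused. A compressed sketch of that flow; the reuse side is taken from this diff, but the line that parks the fragment lives outside these hunks and is an assumption:

    /* on RGET failure elsewhere (assumed): stash the fragment instead of freeing it */
    sendreq->rdma_frag = frag;

    /* later, in mca_pml_ob1_send_request_put (shown above): reuse it */
    if (NULL != sendreq->rdma_frag) {
        frag = sendreq->rdma_frag;
        sendreq->rdma_frag = NULL;
    }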
@@ -124,10 +124,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
         ompi_free_list_item_t* item;                                    \
                                                                         \
         if( OPAL_LIKELY(NULL != proc) ) {                               \
             OMPI_FREE_LIST_WAIT_MT(&mca_pml_base_send_requests, item);  \
             sendreq = (mca_pml_ob1_send_request_t*)item;                \
             sendreq->req_send.req_base.req_proc = proc;                 \
-            sendreq->src_des = NULL;                                    \
         }                                                               \
     }
@@ -163,15 +162,18 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t *type)
         assert( 0 == _position );                                       \
     }
 
-static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq)
+static inline void mca_pml_ob1_free_rdma_resources (mca_pml_ob1_send_request_t* sendreq)
 {
     size_t r;
 
     /* return mpool resources */
     for(r = 0; r < sendreq->req_rdma_cnt; r++) {
-        mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
-        if( NULL != reg && reg->mpool != NULL ) {
-            reg->mpool->mpool_deregister(reg->mpool, reg);
+        struct mca_btl_base_registration_handle_t *handle = sendreq->req_rdma[r].btl_reg;
+        mca_bml_base_btl_t *bml_btl = sendreq->req_rdma[r].bml_btl;
+
+        if (NULL != handle) {
+            mca_bml_base_deregister_mem (bml_btl, handle);
+            sendreq->req_rdma[r].btl_reg = NULL;
         }
     }
     sendreq->req_rdma_cnt = 0;
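Editor's note: the deregistration above pairs with the registration performed in mca_pml_ob1_send_request_put_frag. A minimal sketch of the BTL 3.0 handle lifecycle as this commit uses it, with the bml wrappers taken from the diff and placeholder buffer/length names:

    mca_btl_base_registration_handle_t *handle = NULL;

    /* register the local region with the BTL (flags 0, as in put_frag above) */
    mca_bml_base_register_mem (bml_btl, local_buffer, length, 0, &handle);
    if (NULL != handle) {
        /* ... RDMA operations pass the opaque handle to mca_bml_base_put/get ... */
        mca_bml_base_deregister_mem (bml_btl, handle);  /* release when done */
    }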
@@ -218,10 +220,14 @@ do {
 #define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq)                        \
     do {                                                                \
         /* Let the base handle the reference counts */                  \
         MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send));         \
+        if (sendreq->rdma_frag) {                                       \
+            MCA_PML_OB1_RDMA_FRAG_RETURN (sendreq->rdma_frag);          \
+            sendreq->rdma_frag = NULL;                                  \
+        }                                                               \
         OMPI_FREE_LIST_RETURN_MT( &mca_pml_base_send_requests,          \
                                   (ompi_free_list_item_t*)sendreq);     \
     } while(0)