1
1
openmpi/ompi/mca/pml/ob1/pml_ob1_sendreq.h
Gleb Natapov 54b40aef91 Schedule SEND traffic of pipeline protocol between BTLs in accordance with
relative bandwidths of each BTL. Precalculate what part of a message should
be send via each BTL in advance instead of doing it during scheduling.

This commit was SVN r15248.
2007-07-01 11:34:23 +00:00

414 строки
17 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OMPI_PML_OB1_SEND_REQUEST_H
#define OMPI_PML_OB1_SEND_REQUEST_H
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#include "ompi/mca/mpool/base/base.h"
#include "pml_ob1_comm.h"
#include "pml_ob1_hdr.h"
#include "pml_ob1_rdma.h"
#include "pml_ob1_rdmafrag.h"
#include "ompi/datatype/convertor.h"
#include "ompi/datatype/dt_arch.h"
#include "ompi/mca/bml/bml.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
typedef enum {
MCA_PML_OB1_SEND_PENDING_NONE,
MCA_PML_OB1_SEND_PENDING_SCHEDULE,
MCA_PML_OB1_SEND_PENDING_START
} mca_pml_ob1_send_pending_t;
struct mca_pml_ob1_send_request_t {
mca_pml_base_send_request_t req_send;
mca_bml_base_endpoint_t* req_endpoint;
ompi_ptr_t req_recv;
#if OMPI_HAVE_THREAD_SUPPORT
volatile int32_t req_state;
volatile int32_t req_lock;
#else
int32_t req_state;
int32_t req_lock;
#endif
bool req_throttle_sends;
size_t req_pipeline_depth;
size_t req_bytes_delivered;
uint32_t req_rdma_cnt;
mca_pml_ob1_send_pending_t req_pending;
opal_mutex_t req_send_range_lock;
opal_list_t req_send_ranges;
mca_pml_ob1_com_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_send_request_t);
struct mca_pml_ob1_send_range_t {
ompi_free_list_item_t base;
uint64_t range_send_offset;
uint64_t range_send_length;
int range_btl_idx;
int range_btl_cnt;
mca_pml_ob1_com_btl_t range_btls[1];
};
typedef struct mca_pml_ob1_send_range_t mca_pml_ob1_send_range_t;
OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t);
#define MCA_PML_OB1_SEND_REQUEST_ALLOC( \
comm, \
dst, \
sendreq, \
rc) \
{ \
ompi_proc_t *proc = ompi_comm_peer_lookup( comm, dst ); \
ompi_free_list_item_t* item; \
\
if(NULL == proc) { \
rc = OMPI_ERR_OUT_OF_RESOURCE; \
} else { \
rc = OMPI_SUCCESS; \
OMPI_FREE_LIST_WAIT(&mca_pml_ob1.send_requests, item, rc); \
sendreq = (mca_pml_ob1_send_request_t*)item; \
sendreq->req_send.req_base.req_proc = proc; \
} \
}
#define MCA_PML_OB1_SEND_REQUEST_INIT( \
sendreq, \
buf, \
count, \
datatype, \
dst, \
tag, \
comm, \
sendmode, \
persistent) \
{ \
MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \
buf, \
count, \
datatype, \
dst, \
tag, \
comm, \
sendmode, \
persistent); \
}
static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* sendreq)
{
size_t r;
/* return mpool resources */
for(r = 0; r < sendreq->req_rdma_cnt; r++) {
mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
if( NULL != reg && reg->mpool != NULL ) {
reg->mpool->mpool_deregister(reg->mpool, reg);
}
}
sendreq->req_rdma_cnt = 0;
}
/**
* Start a send request.
*/
#define MCA_PML_OB1_SEND_REQUEST_START(sendreq, rc) \
do { \
rc = mca_pml_ob1_send_request_start(sendreq); \
} while (0)
/*
* Mark a send request as completed at the MPI level.
*/
#define MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq) \
do { \
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \
(sendreq)->req_send.req_base.req_comm->c_my_rank; \
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \
(sendreq)->req_send.req_base.req_tag; \
(sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \
(sendreq)->req_send.req_base.req_ompi.req_status._count = \
(int)(sendreq)->req_send.req_bytes_packed; \
MCA_PML_BASE_REQUEST_MPI_COMPLETE( &((sendreq)->req_send.req_base.req_ompi) ); \
\
/* Could be moved to MCA_PML_BASE_REQUEST_MPI_COMPLETE, but before broadcast */ \
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \
&(sendreq->req_send.req_base), PERUSE_SEND); \
} while(0)
/*
* The PML has completed a send request. Note that this request
* may have been orphaned by the user or have already completed
* at the MPI level.
* This macro will never be called directly from the upper level, as it should
* only be an internal call to the PML.
*/
#define MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq) \
do { \
assert( false == sendreq->req_send.req_base.req_pml_complete ); \
\
if( sendreq->req_send.req_bytes_packed > 0 ) { \
PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END, \
&(sendreq->req_send.req_base), \
PERUSE_SEND ); \
} \
\
/* return mpool resources */ \
mca_pml_ob1_free_rdma_resources(sendreq); \
\
if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && \
sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) { \
mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq); \
} \
\
OPAL_THREAD_LOCK(&ompi_request_lock); \
if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \
/* Should only be called for long messages (maybe synchronous) */ \
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq); \
} \
sendreq->req_send.req_base.req_pml_complete = true; \
\
if( sendreq->req_send.req_base.req_free_called ) { \
MCA_PML_OB1_SEND_REQUEST_RETURN( sendreq ); \
} \
OPAL_THREAD_UNLOCK(&ompi_request_lock); \
} while (0)
/**
* Schedule additional fragments
*/
int mca_pml_ob1_send_request_schedule_exclusive(
mca_pml_ob1_send_request_t* sendreq);
static inline void mca_pml_ob1_send_request_schedule(
mca_pml_ob1_send_request_t* sendreq)
{
/*
* Only allow one thread in this routine for a given request.
* However, we cannot block callers on a mutex, so simply keep track
* of the number of times the routine has been called and run through
* the scheduling logic once for every call.
*/
if(OPAL_THREAD_ADD32(&sendreq->req_lock, 1) == 1)
mca_pml_ob1_send_request_schedule_exclusive(sendreq);
}
/*
* Release resources associated with a request
*/
#define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \
{ \
/* Let the base handle the reference counts */ \
MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \
OMPI_FREE_LIST_RETURN( \
&mca_pml_ob1.send_requests, (ompi_free_list_item_t*)sendreq); \
}
/**
* Start the specified request
*/
int mca_pml_ob1_send_request_start_buffered(
mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_ob1_send_request_start_copy(
mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_ob1_send_request_start_prepare(
mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_ob1_send_request_start_rdma(
mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size);
int mca_pml_ob1_send_request_start_rndv(
mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size,
int flags);
static inline int mca_pml_ob1_send_request_start_btl(
mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl)
{
size_t size = sendreq->req_send.req_bytes_packed;
size_t eager_limit = bml_btl->btl_eager_limit - sizeof(mca_pml_ob1_hdr_t);
int rc;
if(size <= eager_limit) {
switch(sendreq->req_send.req_send_mode) {
case MCA_PML_BASE_SEND_SYNCHRONOUS:
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
break;
case MCA_PML_BASE_SEND_BUFFERED:
rc = mca_pml_ob1_send_request_start_copy(sendreq, bml_btl, size);
break;
case MCA_PML_BASE_SEND_COMPLETE:
rc = mca_pml_ob1_send_request_start_prepare(sendreq, bml_btl, size);
break;
default:
if (size != 0 && bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) {
rc = mca_pml_ob1_send_request_start_prepare(sendreq, bml_btl, size);
} else {
rc = mca_pml_ob1_send_request_start_copy(sendreq, bml_btl, size);
}
break;
}
} else {
size = eager_limit;
if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) {
rc = mca_pml_ob1_send_request_start_buffered(sendreq, bml_btl, size);
} else if
(ompi_convertor_need_buffers(&sendreq->req_send.req_convertor) == false) {
unsigned char *base;
ompi_convertor_get_current_pointer( &sendreq->req_send.req_convertor, (void**)&base );
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_btls(
sendreq->req_endpoint,
base,
sendreq->req_send.req_bytes_packed,
sendreq->req_rdma))) {
rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
sendreq->req_send.req_bytes_packed);
if(OMPI_SUCCESS != rc) {
mca_pml_ob1_free_rdma_resources(sendreq);
}
} else {
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
MCA_PML_OB1_HDR_FLAGS_CONTIG);
}
} else {
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0);
}
}
return rc;
}
static inline int mca_pml_ob1_send_request_start(
mca_pml_ob1_send_request_t* sendreq)
{
mca_pml_ob1_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm;
mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
sendreq->req_send.req_base.req_proc->proc_bml;
size_t i;
if(endpoint == NULL) {
return OMPI_ERR_UNREACH;
}
sendreq->req_endpoint = endpoint;
sendreq->req_state = 0;
sendreq->req_lock=0 ;
sendreq->req_pipeline_depth = 0;
sendreq->req_bytes_delivered = 0;
sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_NONE;
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
mca_bml_base_btl_t* bml_btl;
int rc;
/* select a btl */
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl);
if(OMPI_ERR_OUT_OF_RESOURCE != rc)
return rc;
}
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_START;
opal_list_append(&mca_pml_ob1.send_pending, (opal_list_item_t*)sendreq);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_SUCCESS;
}
/**
* Completion callback on match header
* Cache descriptor.
*/
void mca_pml_ob1_match_completion_cache(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* descriptor,
int status);
/**
* Completion callback on match header
* Free descriptor.
*/
void mca_pml_ob1_match_completion_free(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* ep,
struct mca_btl_base_descriptor_t* descriptor,
int status);
/**
* Initiate a put scheduled by the receiver.
*/
void mca_pml_ob1_send_request_put(
mca_pml_ob1_send_request_t* sendreq,
mca_btl_base_module_t* btl,
mca_pml_ob1_rdma_hdr_t* hdr);
int mca_pml_ob1_send_request_put_frag(mca_pml_ob1_rdma_frag_t* frag);
/* This function tries to continue sendreq that was stuck because of resource
* unavailability. A sendreq may be added to send_pending list if there is no
* resource to send initial packet or there is not resource to schedule data
* for sending. The reason the sendreq was added to the list is stored inside
* sendreq struct and appropriate operation is retried when resource became
* available. bml_btl passed to the function doesn't represents sendreq
* destination, it represents BTL on which resource was freed, so only this BTL
* should be considered for sending packets */
void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
uint64_t send_offset, uint64_t send_length);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif