/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 */
#ifndef OMPI_PML_OB1_RECV_REQUEST_H
#define OMPI_PML_OB1_RECV_REQUEST_H

#include "pml_ob1.h"
#include "pml_ob1_rdma.h"
#include "pml_ob1_rdmafrag.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/pml/ob1/pml_ob1_comm.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/pml/base/pml_base_recvreq.h"
#include "ompi/datatype/datatype.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif

struct mca_pml_ob1_recv_request_t {
    mca_pml_base_recv_request_t req_recv;
    ompi_ptr_t req_send;         /**< matching send request on the peer */
    int32_t  req_lock;           /**< counting lock (see lock_recv_request below) */
    size_t   req_pipeline_depth;
    size_t   req_bytes_received; /**< bytes received from the network so far */
    size_t   req_bytes_delivered;/**< bytes that fit into the posted user buffer */
    size_t   req_rdma_offset;
    size_t   req_send_offset;
    uint32_t req_rdma_cnt;       /**< number of entries in req_rdma */
    uint32_t req_rdma_idx;       /**< next req_rdma entry to schedule */
    bool req_pending;
    bool req_ack_sent;           /**< whether ack was sent to the sender */
    bool req_match_received;     /**< prevent the request from being completed prematurely */
    opal_mutex_t lock;
    mca_pml_ob1_com_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;

OBJ_CLASS_DECLARATION(mca_pml_ob1_recv_request_t);

static inline bool lock_recv_request(mca_pml_ob1_recv_request_t *recvreq)
{
    return OPAL_THREAD_ADD32(&recvreq->req_lock, 1) == 1;
}

static inline bool unlock_recv_request(mca_pml_ob1_recv_request_t *recvreq)
{
    return OPAL_THREAD_ADD32(&recvreq->req_lock, -1) == 0;
}
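
/*
 * Note on req_lock: the pair above implements a counting lock, not a mutex.
 * The thread that increments the counter from 0 to 1 becomes the owner;
 * concurrent callers merely bump the counter and return false.  The owner
 * keeps re-running its critical work until the counter drains back to 0,
 * so work requested by other threads is never lost.  An illustrative
 * sketch (this is exactly the pattern used by
 * mca_pml_ob1_recv_request_schedule() further down in this file):
 *
 *   if (lock_recv_request(recvreq)) {              // counter 0 -> 1: we own it
 *       do {
 *           // ... schedule/progress the request ...
 *       } while (!unlock_recv_request(recvreq));   // re-run if others queued
 *   }
 */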

/**
 * Allocate a recv request from the module's free list.
 *
 * @param recvreq (OUT) Receive request.
 * @param rc      (OUT) OMPI_SUCCESS or error status on failure.
 */
#define MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq, rc)                       \
do {                                                                      \
    ompi_free_list_item_t* item;                                          \
    rc = OMPI_SUCCESS;                                                    \
    OMPI_FREE_LIST_GET(&mca_pml_base_recv_requests, item, rc);            \
    recvreq = (mca_pml_ob1_recv_request_t*)item;                          \
} while(0)

/**
 * Initialize a receive request with call parameters.
 *
 * @param request (IN)    Receive request.
 * @param addr (IN)       User buffer.
 * @param count (IN)      Number of elements of indicated datatype.
 * @param datatype (IN)   User defined datatype.
 * @param src (IN)        Source rank w/in the communicator.
 * @param tag (IN)        User defined tag.
 * @param comm (IN)       Communicator.
 * @param persistent (IN) Is this a persistent request.
 */
#define MCA_PML_OB1_RECV_REQUEST_INIT( request,                           \
                                       addr,                              \
                                       count,                             \
                                       datatype,                          \
                                       src,                               \
                                       tag,                               \
                                       comm,                              \
                                       persistent)                        \
do {                                                                      \
    MCA_PML_BASE_RECV_REQUEST_INIT( &(request)->req_recv,                 \
                                    addr,                                 \
                                    count,                                \
                                    datatype,                             \
                                    src,                                  \
                                    tag,                                  \
                                    comm,                                 \
                                    persistent);                          \
} while(0)

/**
 * Mark the request as completed at MPI level for internal purposes.
 *
 * @param recvreq (IN) Receive request.
 */
#define MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE( recvreq )                            \
do {                                                                                \
    PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                              \
                             &((recvreq)->req_recv.req_base), PERUSE_RECV );        \
    MCA_PML_BASE_REQUEST_MPI_COMPLETE( &((recvreq)->req_recv.req_base.req_ompi) );  \
} while (0)

/*
 * Free the PML receive request
 */
#define MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq)                          \
do {                                                                      \
    MCA_PML_BASE_RECV_REQUEST_FINI(&(recvreq)->req_recv);                 \
    OMPI_FREE_LIST_RETURN( &mca_pml_base_recv_requests,                   \
                           (ompi_free_list_item_t*)(recvreq));            \
} while (0)

/**
 * Complete a receive request. The request structure cannot be accessed
 * after this function returns.
 *
 * @param recvreq (IN) Receive request.
 */
static inline void
recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
{
    size_t i;

    assert(false == recvreq->req_recv.req_base.req_pml_complete);

    if(recvreq->req_recv.req_bytes_packed > 0) {
        PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END,
                                 &recvreq->req_recv.req_base, PERUSE_RECV );
    }

    /* release any memory registrations cached for RDMA */
    for(i = 0; i < recvreq->req_rdma_cnt; i++) {
        mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[i].btl_reg;
        if( NULL != btl_reg && btl_reg->mpool != NULL) {
            btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
        }
    }
    recvreq->req_rdma_cnt = 0;

    OPAL_THREAD_LOCK(&ompi_request_lock);
    if(true == recvreq->req_recv.req_base.req_free_called) {
        MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
    } else {
        /* initialize request status */
        recvreq->req_recv.req_base.req_pml_complete = true;
        recvreq->req_recv.req_base.req_ompi.req_status._count =
            (int)recvreq->req_bytes_received;
        if (recvreq->req_bytes_received > recvreq->req_bytes_delivered) {
            /* more data arrived than the posted buffer can hold */
            recvreq->req_recv.req_base.req_ompi.req_status._count =
                (int)recvreq->req_bytes_delivered;
            recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =
                MPI_ERR_TRUNCATE;
        }
        MCA_PML_OB1_RECV_REQUEST_MPI_COMPLETE(recvreq);
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);
}

static inline bool
recv_request_pml_complete_check(mca_pml_ob1_recv_request_t *recvreq)
{
    opal_atomic_rmb();
    if(recvreq->req_match_received &&
       recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
       lock_recv_request(recvreq)) {
        recv_request_pml_complete(recvreq);
        return true;
    }
    return false;
}

/**
 * Attempt to match the request against the unexpected fragment list
 * for all source ranks w/in the communicator.
 *
 * @param request (IN) Request to match.
 */
void mca_pml_ob1_recv_request_match_wild(mca_pml_ob1_recv_request_t* request);

/**
 * Attempt to match the request against the unexpected fragment list
 * for a specific source rank.
 *
 * @param request (IN) Request to match.
 */
void mca_pml_ob1_recv_request_match_specific(mca_pml_ob1_recv_request_t* request);

/**
 * Start an initialized request.
 *
 * @param request Receive request.
 * @return OMPI_SUCCESS or error status on failure.
 */
#define MCA_PML_OB1_RECV_REQUEST_START(request)                                     \
do {                                                                                \
    /* init/re-init the request */                                                  \
    (request)->req_lock = 0;                                                        \
    (request)->req_pipeline_depth = 0;                                              \
    (request)->req_bytes_received = 0;                                              \
    (request)->req_bytes_delivered = 0;                                             \
    /* What about req_rdma_cnt ? */                                                 \
    (request)->req_rdma_idx = 0;                                                    \
    (request)->req_pending = false;                                                 \
    (request)->req_ack_sent = false;                                                \
    (request)->req_match_received = false;                                          \
                                                                                    \
    MCA_PML_BASE_RECV_START( &(request)->req_recv.req_base );                       \
                                                                                    \
    /* attempt to match posted recv */                                              \
    if((request)->req_recv.req_base.req_peer == OMPI_ANY_SOURCE) {                  \
        mca_pml_ob1_recv_request_match_wild(request);                               \
    } else {                                                                        \
        (request)->req_recv.req_base.req_proc =                                     \
            (request)->req_recv.req_base.req_comm->c_pml_comm->procs               \
                [(request)->req_recv.req_base.req_peer].ompi_proc;                  \
        if( (0 != (request)->req_recv.req_base.req_datatype->size) &&               \
            (0 != (request)->req_recv.req_base.req_count) ) {                       \
            ompi_convertor_copy_and_prepare_for_recv(                               \
                (request)->req_recv.req_base.req_proc->proc_convertor,              \
                (request)->req_recv.req_base.req_datatype,                          \
                (request)->req_recv.req_base.req_count,                             \
                (request)->req_recv.req_base.req_addr,                              \
                0,                                                                  \
                &(request)->req_recv.req_base.req_convertor );                      \
            ompi_convertor_get_unpacked_size( &(request)->req_recv.req_base.req_convertor, \
                                              &(request)->req_bytes_delivered );    \
        }                                                                           \
        mca_pml_ob1_recv_request_match_specific(request);                           \
    }                                                                               \
} while (0)
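
/*
 * Typical life cycle of a receive request, as driven by the PML entry
 * points (illustrative sketch only; error handling omitted):
 *
 *   mca_pml_ob1_recv_request_t *recvreq;
 *   int rc;
 *
 *   MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq, rc);        // from the free list
 *   MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, buf, count,  // bind call arguments
 *                                 datatype, src, tag, comm, false);
 *   MCA_PML_OB1_RECV_REQUEST_START(recvreq);            // post + try to match
 *   // ... fragments arrive and mca_pml_ob1_recv_request_progress() unpacks
 *   // them; recv_request_pml_complete() finishes the request and returns it
 *   // to the free list once the user has freed it ...
 */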

/**
 * Update the request with information from the matched fragment header.
 */
#define MCA_PML_OB1_RECV_REQUEST_MATCHED( request, hdr )                            \
do {                                                                                \
    (request)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = (hdr)->hdr_src;   \
    (request)->req_recv.req_base.req_ompi.req_status.MPI_TAG = (hdr)->hdr_tag;      \
                                                                                    \
    if((request)->req_recv.req_bytes_packed > 0) {                                  \
        if( MPI_ANY_SOURCE == (request)->req_recv.req_base.req_peer ) {             \
            /* the source was unknown at post time, so the convertor could */       \
            /* not be prepared then; prepare it now */                              \
            ompi_convertor_copy_and_prepare_for_recv(                               \
                (request)->req_recv.req_base.req_proc->proc_convertor,              \
                (request)->req_recv.req_base.req_datatype,                          \
                (request)->req_recv.req_base.req_count,                             \
                (request)->req_recv.req_base.req_addr,                              \
                0,                                                                  \
                &(request)->req_recv.req_base.req_convertor );                      \
            ompi_convertor_get_unpacked_size( &(request)->req_recv.req_base.req_convertor, \
                                              &(request)->req_bytes_delivered );    \
        }                                                                           \
        PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_XFER_BEGIN,                        \
                                 &((request)->req_recv.req_base), PERUSE_RECV);     \
    }                                                                               \
} while (0)

/**
 * Unpack received segment data into the user buffer, starting at
 * data_offset bytes into the message and skipping the first seg_offset
 * bytes of the segment list.
 */
#define MCA_PML_OB1_RECV_REQUEST_UNPACK( request,                                   \
                                         segments,                                  \
                                         num_segments,                              \
                                         seg_offset,                                \
                                         data_offset,                               \
                                         bytes_received,                            \
                                         bytes_delivered)                           \
do {                                                                                \
    bytes_delivered = 0;                                                            \
    if((request)->req_recv.req_bytes_packed > 0) {                                  \
        struct iovec iov[MCA_BTL_DES_MAX_SEGMENTS];                                 \
        uint32_t iov_count = 0;                                                     \
        size_t max_data = bytes_received;                                           \
        size_t n, offset = seg_offset;                                              \
        mca_btl_base_segment_t* segment = segments;                                 \
                                                                                    \
        OPAL_THREAD_LOCK(&(request)->lock);                                         \
        /* build an iovec over the segment payload, skipping the first */           \
        /* seg_offset bytes (typically the PML header) */                           \
        for( n = 0; n < num_segments; n++, segment++ ) {                            \
            if(offset >= segment->seg_len) {                                        \
                offset -= segment->seg_len;                                         \
            } else {                                                                \
                iov[iov_count].iov_len = segment->seg_len - offset;                 \
                iov[iov_count].iov_base = (IOVBASE_TYPE*)                           \
                    ((unsigned char*)segment->seg_addr.pval + offset);              \
                iov_count++;                                                        \
                offset = 0; /* subsequent segments are consumed in full */          \
            }                                                                       \
        }                                                                           \
        PERUSE_TRACE_COMM_OMPI_EVENT (PERUSE_COMM_REQ_XFER_CONTINUE,                \
                                      &((request)->req_recv.req_base), max_data,    \
                                      PERUSE_RECV);                                 \
        ompi_convertor_set_position( &((request)->req_recv.req_base.req_convertor), \
                                     &data_offset );                                \
        ompi_convertor_unpack( &(request)->req_recv.req_base.req_convertor,         \
                               iov,                                                 \
                               &iov_count,                                          \
                               &max_data );                                         \
        bytes_delivered = max_data;                                                 \
        OPAL_THREAD_UNLOCK(&(request)->lock);                                       \
    }                                                                               \
} while (0)
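
/*
 * Worked example for MCA_PML_OB1_RECV_REQUEST_UNPACK (hypothetical numbers):
 * with two segments of 64 and 128 bytes and seg_offset = 24 (a 24-byte PML
 * header at the start of the first segment), the iovec built above is
 *
 *   iov[0] = { seg[0].seg_addr.pval + 24,  40 }   // 64 - 24 payload bytes
 *   iov[1] = { seg[1].seg_addr.pval,      128 }
 *
 * and ompi_convertor_unpack() then copies at most bytes_received bytes of
 * that payload into the user buffer at message position data_offset.
 */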

/**
 * Progress the request with data from a received fragment.
 */
void mca_pml_ob1_recv_request_progress(
    mca_pml_ob1_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Complete a probe request that matched an incoming fragment.
 */
void mca_pml_ob1_recv_request_matched_probe(
    mca_pml_ob1_recv_request_t* req,
    struct mca_btl_base_module_t* btl,
    mca_btl_base_segment_t* segments,
    size_t num_segments);

/**
 * Schedule the next portion of the pending transfer for this request.
 */
int mca_pml_ob1_recv_request_schedule_once(
    mca_pml_ob1_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);

static inline int mca_pml_ob1_recv_request_schedule_exclusive(
        mca_pml_ob1_recv_request_t* req,
        mca_bml_base_btl_t* start_bml_btl)
{
    int rc;

    do {
        rc = mca_pml_ob1_recv_request_schedule_once(req, start_bml_btl);
        if(rc == OMPI_ERR_OUT_OF_RESOURCE)
            break;
    } while(!unlock_recv_request(req));

    if(OMPI_SUCCESS == rc)
        recv_request_pml_complete_check(req);

    return rc;
}

static inline void mca_pml_ob1_recv_request_schedule(
        mca_pml_ob1_recv_request_t* req,
        mca_bml_base_btl_t* start_bml_btl)
{
    if(!lock_recv_request(req))
        return;

    (void)mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
}

#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O)                        \
    do {                                                                  \
        mca_pml_ob1_pckt_pending_t *_pckt;                                \
        int _rc;                                                          \
                                                                          \
        MCA_PML_OB1_PCKT_PENDING_ALLOC(_pckt,_rc);                        \
        _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_ACK;        \
        _pckt->hdr.hdr_ack.hdr_src_req.lval = (S);                        \
        _pckt->hdr.hdr_ack.hdr_dst_req.pval = (D);                        \
        _pckt->hdr.hdr_ack.hdr_send_offset = (O);                         \
        _pckt->proc = (P);                                                \
        _pckt->bml_btl = NULL;                                            \
        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);                              \
        opal_list_append(&mca_pml_ob1.pckt_pending,                       \
                         (opal_list_item_t*)_pckt);                       \
        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);                            \
    } while(0)

int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc,
        mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req,
        uint64_t hdr_rdma_offset);

static inline int
mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, uint64_t hdr_src_req,
        void *hdr_dst_req, uint64_t hdr_send_offset)
{
    size_t i;
    mca_bml_base_btl_t* bml_btl;
    mca_bml_base_endpoint_t* endpoint =
        (mca_bml_base_endpoint_t*)proc->proc_bml;

    /* try each eager BTL in turn; queue the ACK on the pending list if
     * none has resources available right now */
    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
        if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req,
                    hdr_dst_req, hdr_send_offset) == OMPI_SUCCESS)
            return OMPI_SUCCESS;
    }

    MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,
                                   hdr_send_offset);

    return OMPI_ERR_OUT_OF_RESOURCE;
}

int mca_pml_ob1_recv_request_get_frag(mca_pml_ob1_rdma_frag_t* frag);

/* This function tries to continue recvreqs that are stuck due to resource
 * unavailability. A recvreq is added to the recv_pending list if scheduling
 * of the put operation cannot be accomplished for some reason. */
void mca_pml_ob1_recv_request_process_pending(void);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

#endif