/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /** * @file */ #ifndef OMPI_PML_OB1_SEND_REQUEST_H #define OMPI_PML_OB1_SEND_REQUEST_H #include "mca/btl/btl.h" #include "mca/pml/base/pml_base_sendreq.h" #include "mca/mpool/base/base.h" #include "pml_ob1_proc.h" #include "pml_ob1_comm.h" #include "pml_ob1_hdr.h" #include "pml_ob1_rdma.h" #include "datatype/convertor.h" #include "mca/bml/bml.h" #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif struct mca_pml_ob1_send_request_t { mca_pml_base_send_request_t req_send; ompi_proc_t* req_proc; mca_bml_base_endpoint_t* req_endpoint; ompi_ptr_t req_recv; #if OMPI_HAVE_THREAD_SUPPORT volatile int32_t req_state; volatile int32_t req_lock; #else volatile int32_t req_state; volatile int32_t req_lock; #endif size_t req_pipeline_depth; size_t req_bytes_delivered; size_t req_send_offset; size_t req_rdma_offset; mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST]; uint32_t req_rdma_cnt; }; typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t; OBJ_CLASS_DECLARATION(mca_pml_ob1_send_request_t); #define MCA_PML_OB1_SEND_REQUEST_ALLOC( \ comm, \ dst, \ sendreq, \ rc) \ { \ ompi_proc_t *proc = \ comm->c_pml_procs[dst]->proc_ompi; \ opal_list_item_t* item; \ \ if(NULL == proc) { \ rc = OMPI_ERR_OUT_OF_RESOURCE; \ } else { \ rc = OMPI_SUCCESS; \ OMPI_FREE_LIST_WAIT(&mca_pml_ob1.send_requests, item, rc); \ sendreq = (mca_pml_ob1_send_request_t*)item; \ sendreq->req_proc = proc; \ } \ } #define MCA_PML_OB1_SEND_REQUEST_INIT( \ sendreq, \ buf, \ count, \ datatype, \ dst, \ tag, \ comm, \ sendmode, \ persistent) \ { \ MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send, \ buf, \ count, \ datatype, \ dst, \ tag, \ comm, \ sendmode, \ persistent); \ } /** * Diagnostic output to trace rdma protocol timing */ /** * Start a send request. */ #define MCA_PML_OB1_SEND_REQUEST_START(sendreq, rc) \ do { \ mca_pml_ob1_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; \ mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)sendreq->req_proc->proc_pml; \ mca_bml_base_btl_t* bml_btl; \ size_t size = sendreq->req_send.req_bytes_packed; \ \ if(endpoint == NULL) { \ rc = OMPI_ERR_UNREACH; \ break; \ } \ \ sendreq->req_lock = 0; \ sendreq->req_pipeline_depth = 0; \ sendreq->req_bytes_delivered = 0; \ sendreq->req_rdma_cnt = 0; \ sendreq->req_state = 0; \ sendreq->req_send_offset = 0; \ sendreq->req_send.req_base.req_pml_complete = false; \ sendreq->req_send.req_base.req_ompi.req_complete = false; \ sendreq->req_send.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE; \ sendreq->req_send.req_base.req_ompi.req_status._cancelled = 0; \ sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32( \ &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1); \ sendreq->req_endpoint = endpoint; \ \ /* select a btl */ \ bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); \ \ /* shortcut for zero byte */ \ if(size == 0 && sendreq->req_send.req_send_mode != MCA_PML_BASE_SEND_SYNCHRONOUS) { \ mca_btl_base_descriptor_t* descriptor; \ mca_btl_base_segment_t* segment; \ mca_pml_ob1_hdr_t* hdr; \ \ /* allocate a descriptor */ \ MCA_PML_OB1_DES_ALLOC(bml_btl, descriptor, sizeof(mca_pml_ob1_match_hdr_t)); \ if(NULL == descriptor) { \ return OMPI_ERR_OUT_OF_RESOURCE; \ } \ segment = descriptor->des_src; \ \ /* build hdr */ \ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; \ hdr->hdr_common.hdr_flags = 0; \ hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; \ hdr->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; \ hdr->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; \ hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag; \ hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence; \ \ /* short message */ \ descriptor->des_cbfunc = mca_pml_ob1_match_completion_cache; \ descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY; \ descriptor->des_cbdata = sendreq; \ \ /* request is complete at mpi level */ \ OPAL_THREAD_LOCK(&ompi_request_lock); \ MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq); \ OPAL_THREAD_UNLOCK(&ompi_request_lock); \ \ /* send */ \ rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML); \ if(OMPI_SUCCESS != rc) { \ mca_bml_base_free(bml_btl, descriptor ); \ } \ \ } else { \ size_t eager_limit = bml_btl->btl_eager_limit - sizeof(mca_pml_ob1_hdr_t); \ if(size <= eager_limit) { \ switch(sendreq->req_send.req_send_mode) { \ case MCA_PML_BASE_SEND_SYNCHRONOUS: \ rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0); \ break; \ case MCA_PML_BASE_SEND_BUFFERED: \ rc = mca_pml_ob1_send_request_start_copy(sendreq, bml_btl, size); \ break; \ default: \ if (bml_btl->btl_flags & MCA_BTL_FLAGS_SEND_INPLACE) { \ rc = mca_pml_ob1_send_request_start_prepare(sendreq, bml_btl, size); \ } else { \ rc = mca_pml_ob1_send_request_start_copy(sendreq, bml_btl, size); \ } \ break; \ } \ } else { \ size = eager_limit; \ if(sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { \ rc = mca_pml_ob1_send_request_start_buffered(sendreq, bml_btl, size); \ } else if \ (ompi_convertor_need_buffers(&sendreq->req_send.req_convertor) == false) { \ if( 0 != (sendreq->req_rdma_cnt = mca_pml_ob1_rdma_btls( \ sendreq->req_endpoint, \ sendreq->req_send.req_addr, \ sendreq->req_send.req_bytes_packed, \ sendreq->req_rdma))) { \ rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, size); \ } else { \ rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, MCA_PML_OB1_HDR_FLAGS_CONTIG); \ } \ } else { \ rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size, 0); \ } \ } \ } \ } while (0) /* * Mark a send request as completed at the MPI level. */ #define MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq) \ do { \ (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \ (sendreq)->req_send.req_base.req_comm->c_my_rank; \ (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \ (sendreq)->req_send.req_base.req_tag; \ (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ (sendreq)->req_send.req_base.req_ompi.req_status._count = \ (sendreq)->req_send.req_bytes_packed; \ (sendreq)->req_send.req_base.req_ompi.req_complete = true; \ ompi_request_completed++; \ if(ompi_request_waiting) { \ opal_condition_broadcast(&ompi_request_cond); \ } \ } while(0) /* * The PML has completed a send request. Note that this request * may have been orphaned by the user or have already completed * at the MPI level. */ #define MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq) \ do { \ size_t r; \ /* request completed at pml level */ \ (sendreq)->req_send.req_base.req_pml_complete = true; \ \ /* return mpool resources */ \ for(r=0; r<sendreq->req_rdma_cnt; r++) { \ mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg; \ if( NULL != reg ) { \ reg->mpool->mpool_release(reg->mpool, reg); \ } \ } \ sendreq->req_rdma_cnt = 0; \ \ /* user has already released the request so simply free it */ \ if((sendreq)->req_send.req_base.req_free_called) { \ /* if buffered send - release any resources */ \ if ((sendreq)->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && \ (sendreq)->req_send.req_addr != (sendreq)->req_send.req_base.req_addr) { \ mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq); \ } \ MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq); \ /* is request complete at mpi level */ \ } else if ((sendreq)->req_send.req_base.req_ompi.req_complete == false) { \ MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq); \ /* buffered send - release any resources */ \ } else if ((sendreq)->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && \ (sendreq)->req_send.req_addr != (sendreq)->req_send.req_base.req_addr) { \ mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq); \ } \ } while (0) /* * Advance a pending send request. Note that the initial descriptor must complete * and the acknowledment received before the request can complete or be scheduled. * However, these events may occur in either order. */ #define MCA_PML_OB1_SEND_REQUEST_ADVANCE(sendreq) \ do { \ bool schedule = false; \ \ /* has an acknowledgment been received */ \ if(OPAL_THREAD_ADD32(&sendreq->req_state, 1) == 2) { \ OPAL_THREAD_LOCK(&ompi_request_lock); \ if(sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) { \ MCA_PML_OB1_SEND_REQUEST_PML_COMPLETE(sendreq); \ } else { \ schedule = true; \ } \ OPAL_THREAD_UNLOCK(&ompi_request_lock); \ } \ \ /* additional data to schedule */ \ if(schedule == true) { \ mca_pml_ob1_send_request_schedule(sendreq); \ } \ } while (0) /* * Release resources associated with a request */ #define MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq) \ { \ /* Let the base handle the reference counts */ \ MCA_PML_BASE_SEND_REQUEST_FINI((&(sendreq)->req_send)); \ OMPI_FREE_LIST_RETURN( \ &mca_pml_ob1.send_requests, (opal_list_item_t*)sendreq); \ } /* * Update bytes delivered on request based on supplied descriptor */ #define MCA_PML_OB1_SEND_REQUEST_SET_BYTES_DELIVERED(sendreq, descriptor, hdrlen) \ do { \ size_t i; \ mca_btl_base_segment_t* segments = descriptor->des_src; \ \ for(i=0; i<descriptor->des_src_cnt; i++) { \ sendreq->req_bytes_delivered += segments[i].seg_len; \ } \ sendreq->req_bytes_delivered -= hdrlen; \ \ } while(0) /* * Attempt to process any pending requests */ #define MCA_PML_OB1_SEND_REQUEST_PROCESS_PENDING() \ do { \ /* advance pending requests */ \ while(opal_list_get_size(&mca_pml_ob1.send_pending)) { \ mca_pml_ob1_send_request_t* sendreq; \ OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \ sendreq = (mca_pml_ob1_send_request_t*) \ opal_list_remove_first(&mca_pml_ob1.send_pending); \ OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); \ if(NULL == sendreq) \ break; \ mca_pml_ob1_send_request_schedule(sendreq); \ } \ } while (0) /** * Start the specified request */ int mca_pml_ob1_send_request_start_buffered( mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size); int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size); int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size); int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size); int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size, int flags); /** * Schedule additional fragments */ int mca_pml_ob1_send_request_schedule( mca_pml_ob1_send_request_t* sendreq); /** * Completion callback on match header * Cache descriptor. */ void mca_pml_ob1_match_completion_cache( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, struct mca_btl_base_descriptor_t* descriptor, int status); /** * Completion callback on match header * Free descriptor. */ void mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, struct mca_btl_base_descriptor_t* descriptor, int status); /** * Initiate a put scheduled by the receiver. */ void mca_pml_ob1_send_request_put( mca_pml_ob1_send_request_t* sendreq, mca_btl_base_module_t* btl, mca_pml_ob1_rdma_hdr_t* hdr); #if defined(c_plusplus) || defined(__cplusplus) } #endif #endif