/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2007 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Mellanox Technologies.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/

#include "ompi_config.h"
#include "opal/util/crc.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/class/ompi_seq_tracker.h"
#include "orte/mca/errmgr/errmgr.h"
#include "pml_dr.h"
#include "pml_dr_hdr.h"
#include "pml_dr_sendreq.h"
#include "pml_dr_recvreq.h"
#include "ompi/mca/bml/base/base.h"

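/*
 * Protocol overview (a summary inferred from the code below): each send is
 * broken into one or more "virtual fragments" (vfrags).  A watchdog (WDOG)
 * timer runs while fragments of a vfrag await local BTL completion, and an
 * ACK timer runs while the sender waits for the receiver's ack bitmask.
 * A vfrag is finished once its ack bits (vf_ack) match its expected mask
 * (vf_mask); missing bits trigger retransmission.
 */
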
/*
 * The free call marks the final stage in a request life-cycle. Starting from
 * this point the request is completed at both the PML and user level, and can
 * be reused for other p2p communications. Therefore, in the case of the DR PML
 * it should be added to the free request list.
 */
static int mca_pml_dr_send_request_free(struct ompi_request_t** request)
{
    mca_pml_dr_send_request_t* sendreq = *(mca_pml_dr_send_request_t**)request;

    assert( false == sendreq->req_send.req_base.req_free_called );
    OPAL_THREAD_LOCK(&ompi_request_lock);
    sendreq->req_send.req_base.req_free_called = true;
    if( true == sendreq->req_send.req_base.req_pml_complete ) {
        MCA_PML_DR_SEND_REQUEST_RETURN( sendreq );
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    *request = MPI_REQUEST_NULL;
    return OMPI_SUCCESS;
}

static int mca_pml_dr_send_request_cancel(struct ompi_request_t* request, int complete)
{
    /* we don't currently cancel send requests */
    return OMPI_SUCCESS;
}

static void mca_pml_dr_send_request_construct(mca_pml_dr_send_request_t* req)
{
    OBJ_CONSTRUCT(&req->req_vfrag0, mca_pml_dr_vfrag_t);
    OBJ_CONSTRUCT(&req->req_retrans, opal_list_t);

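    /* vf_len is the number of fragments in the vfrag and vf_mask is the
     * bitmask of acks expected for them (see the vf_ack == vf_mask tests
     * throughout this file); the request's built-in vfrag0 covers the
     * single eager/rendezvous fragment, so both are 1 */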
    req->req_vfrag0.vf_len = 1;
    req->req_vfrag0.vf_mask = 1;
    req->req_vfrag0.vf_send.pval = req;
    req->req_send.req_base.req_type = MCA_PML_REQUEST_SEND;
    req->req_send.req_base.req_ompi.req_free = mca_pml_dr_send_request_free;
    req->req_send.req_base.req_ompi.req_cancel = mca_pml_dr_send_request_cancel;
}

static void mca_pml_dr_send_request_destruct(mca_pml_dr_send_request_t* req)
{
    OBJ_DESTRUCT(&req->req_vfrag0);
    OBJ_DESTRUCT(&req->req_retrans);
}


OBJ_CLASS_INSTANCE(
    mca_pml_dr_send_request_t,
    mca_pml_base_send_request_t,
    mca_pml_dr_send_request_construct,
    mca_pml_dr_send_request_destruct);

/**
 * Handle error status on local completion
 */

static void mca_pml_dr_error_completion(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* ep,
    struct mca_btl_base_descriptor_t* descriptor,
    int status)
{
    mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata;
    mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;

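    /* the completion status may be SOS-encoded (see SVN r23162), so the
     * native error code must be recovered with OPAL_SOS_GET_ERROR_CODE()
     * before it can be compared against OMPI_ERR_* values */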
    switch(OPAL_SOS_GET_ERROR_CODE(status)) {
    case OMPI_ERR_UNREACH:
    case OMPI_ERR_CONNECTION_FAILED:
    case OMPI_ERR_CONNECTION_REFUSED:
        /**
         * peer is no longer reachable through this btl
         */
        mca_bml.bml_del_proc_btl(sendreq->req_proc->ompi_proc, btl);
        break;

    case OMPI_ERR_FATAL:
    case OMPI_ERR_COMM_FAILURE:
        /**
         * btl is no longer available
         */
        mca_bml.bml_del_btl(btl);
        break;
    default:
        orte_errmgr.abort(-1, NULL);
        break;
    }

    /* update pending counts */
    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1);
    OPAL_THREAD_ADD64(&vfrag->vf_pending,-1);

    /* reset vfrag state - select new BTL */
    mca_pml_dr_vfrag_reset(vfrag);

    /* reschedule vfrag */
    mca_pml_dr_vfrag_reschedule(vfrag);
}

/**
 * Completion of a short message - nothing left to schedule.
 */

static void mca_pml_dr_match_completion(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* ep,
    struct mca_btl_base_descriptor_t* descriptor,
    int status)
{
    mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata;
    mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;

    /* kill pending wdog timer */
    MCA_PML_DR_VFRAG_WDOG_STOP(vfrag);

    /* free any descriptor used to retransmit */
    if(descriptor != sendreq->req_descriptor) {
        mca_bml_base_free( (mca_bml_base_btl_t*)descriptor->des_context, descriptor);
    }

    /* check completion status */
    if(OMPI_SUCCESS != status) {
        mca_pml_dr_error_completion(btl,ep,descriptor,status);
        return;
    }

    /* wait for local completion */
    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1);
    if(OPAL_THREAD_ADD64(&vfrag->vf_pending,-1) > 0)
        return;

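    /* roadmap for the branches below: fully acked -> return the descriptor
     * and complete the request; NACKed -> retransmit the eager data;
     * no ack yet -> arm the ack timer and wait */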
    /* wait for positive ack to complete request */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    if(vfrag->vf_ack == vfrag->vf_mask) {

        /* return descriptor */
        if(NULL != sendreq->req_descriptor) {
            if(NULL != sendreq->req_descriptor->des_context) {
                mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor);
            }
            sendreq->req_descriptor = NULL;
        }

        /* update statistics and complete */
        sendreq->req_bytes_delivered = sendreq->req_send.req_bytes_packed;
        MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n",
                             __FILE__, __LINE__, vfrag->vf_id));

        ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id);
        MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq);

    /* on negative ack need to retransmit */
    } else if(vfrag->vf_state & MCA_PML_DR_VFRAG_NACKED) {
        MCA_PML_DR_VFRAG_WDOG_START(vfrag);
        MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag);

    /* start ack timer */
    } else {
        MCA_PML_DR_VFRAG_ACK_START(vfrag);
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* check for pending requests */
    MCA_PML_DR_SEND_REQUEST_PROCESS_PENDING();
}

/*
 * Completion of the first fragment of a long message that
 * requires an acknowledgement
 */

static void mca_pml_dr_rndv_completion(
    mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* ep,
    struct mca_btl_base_descriptor_t* descriptor,
    int status)
{
    mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata;
    mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;
    bool schedule = false;

    /* kill pending wdog timer */
    MCA_PML_DR_VFRAG_WDOG_STOP(vfrag);

    /* free any descriptor used to retransmit */
    if(descriptor != sendreq->req_descriptor) {
        mca_bml_base_free( (mca_bml_base_btl_t*)descriptor->des_context, descriptor );
    }

    /* check completion status */
    if(OMPI_SUCCESS != status) {
        mca_pml_dr_error_completion(btl,ep,descriptor,status);
        return;
    }

    /* local completion */
    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1);
    if(OPAL_THREAD_ADD64(&vfrag->vf_pending,-1) > 0)
        return;

    /* wait for positive ack to complete request */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    if(vfrag->vf_ack == vfrag->vf_mask) {
        if(sendreq->req_descriptor) {
            if(NULL != sendreq->req_descriptor->des_context) {
                mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor);
            }
            sendreq->req_descriptor = NULL;
        }

        /* update statistics and complete */
        MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n",
                             __FILE__, __LINE__, vfrag->vf_id));

        ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id);
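        /* only the rendezvous eager data has gone out at this point; if the
         * ack already reported the whole message delivered we are done,
         * otherwise the remainder is scheduled once the lock is dropped */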
        if(sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) {
            MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq);
        } else {
            schedule = true;
        }
        OPAL_THREAD_UNLOCK(&ompi_request_lock);
        if(schedule) {
            mca_pml_dr_send_request_schedule(sendreq);
        }

    /* on negative ack need to retransmit */
    } else if(vfrag->vf_state & MCA_PML_DR_VFRAG_NACKED) {
        MCA_PML_DR_VFRAG_ACK_RESET(vfrag);
        MCA_PML_DR_SEND_REQUEST_RNDV_PROBE(sendreq, vfrag);
        OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* setup ack timer */
    } else {
        OPAL_THREAD_UNLOCK(&ompi_request_lock);
        MCA_PML_DR_VFRAG_ACK_START(vfrag);
    }

    /* check for pending requests */
    MCA_PML_DR_SEND_REQUEST_PROCESS_PENDING();
}


/**
 * Completion of additional fragments of a large message - may need
 * to schedule additional fragments.
 */

static void mca_pml_dr_frag_completion(
    mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* ep,
    struct mca_btl_base_descriptor_t* descriptor,
    int status)
{
    mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)descriptor->des_cbdata;
    mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;
    mca_bml_base_btl_t* bml_btl = vfrag->bml_btl;
    bool schedule = false;

    /* check completion status */
    if(OMPI_SUCCESS != status) {
        mca_pml_dr_error_completion(btl,ep,descriptor,status);
        return;
    }

    /* have all pending frags completed for this vfrag? */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    vfrag->vf_pending -= 1;
    if(vfrag->vf_pending == 0) {
        MCA_PML_DR_VFRAG_WDOG_STOP(vfrag);

        /* has the vfrag already been acked? */
        if (vfrag->vf_ack == vfrag->vf_mask) {

            sendreq->req_bytes_delivered += vfrag->vf_size;
            assert(sendreq->req_bytes_delivered <= sendreq->req_send.req_bytes_packed);

            /* is this vfrag in the process of being retransmitted? */
            if(vfrag->vf_state & MCA_PML_DR_VFRAG_RETRANS) {
                opal_list_remove_item(&sendreq->req_retrans, (opal_list_item_t*)vfrag);
                vfrag->vf_state &= ~MCA_PML_DR_VFRAG_RETRANS;
            }

            /* record vfrag id to drop duplicate acks */
            MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n",
                                 __FILE__, __LINE__, vfrag->vf_id));

            ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id);

            /* return this vfrag */
            MCA_PML_DR_VFRAG_RETURN(vfrag);

        /* waiting on ack */
        } else if (vfrag->vf_idx == vfrag->vf_len) {
            MCA_PML_DR_VFRAG_ACK_START(vfrag);
        }

    /* if not, reset the watchdog timer */
    } else {
        MCA_PML_DR_VFRAG_WDOG_RESET(vfrag);
    }

    /* are we done with this request? */
    if(OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1) == 0 &&
       sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) {
        MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq);
    } else if (sendreq->req_send_offset < sendreq->req_send.req_bytes_packed ||
               opal_list_get_size(&sendreq->req_retrans)) {
        schedule = true;
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* return the descriptor */
    mca_bml_base_free(bml_btl, descriptor);

    /* schedule remainder of message? */
    if(schedule) {
        mca_pml_dr_send_request_schedule(sendreq);
    }

    /* check for pending requests */
    MCA_PML_DR_SEND_REQUEST_PROCESS_PENDING();
}


/**
 * Buffer the entire message and mark as complete.
 */

int mca_pml_dr_send_request_start_buffered(
    mca_pml_dr_send_request_t* sendreq,
    mca_bml_base_btl_t* bml_btl,
    size_t size)
{
    mca_btl_base_descriptor_t* descriptor;
    mca_btl_base_segment_t* segment;
    mca_pml_dr_hdr_t* hdr;
    struct iovec iov;
    unsigned int iov_count;
    size_t max_data;
    int rc;
    uint32_t csum;
    bool do_csum = mca_pml_dr.enable_csum &&
        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* allocate descriptor */
    mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER,
                       sizeof(mca_pml_dr_rendezvous_hdr_t) + size,
                       MCA_BTL_DES_FLAGS_PRIORITY);
    if(NULL == descriptor) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sendreq->req_descriptor = descriptor; /* hang on to this for later */
    segment = descriptor->des_src;

    /* pack the data into the BTL supplied buffer */
    iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
                                   sizeof(mca_pml_dr_rendezvous_hdr_t));
    iov.iov_len = size;
    iov_count = 1;
    max_data = size;
    if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor,
                                  &iov,
                                  &iov_count,
                                  &max_data)) < 0) {
        mca_bml_base_free(bml_btl, descriptor);
        return rc;
    }
    csum = sendreq->req_send.req_base.req_convertor.checksum;

    /* update lengths */
    segment->seg_len = sizeof(mca_pml_dr_rendezvous_hdr_t) + max_data;
    sendreq->req_vfrag0.vf_size = max_data;
    sendreq->req_vfrag0.bml_btl = bml_btl;
    sendreq->req_vfrag0.vf_state |= MCA_PML_DR_VFRAG_RNDV;
    sendreq->req_vfrag0.vf_pending = 1;

    descriptor->des_cbfunc = mca_pml_dr_rndv_completion;
    descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
    descriptor->des_cbdata = &sendreq->req_vfrag0;

    /* buffer the remainder of the message */
    rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq);
    if(OMPI_SUCCESS != rc) {
        mca_bml_base_free(bml_btl, descriptor);
        return rc;
    }

    iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)sendreq->req_send.req_addr) + max_data);
    iov.iov_len = sendreq->req_send.req_bytes_packed - max_data;

    max_data = iov.iov_len;
    if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor,
                                  &iov,
                                  &iov_count,
                                  &max_data)) < 0) {
        mca_bml_base_free(bml_btl, descriptor);
        return rc;
    }

    /* build rendezvous header */
    hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval;
    hdr->hdr_common.hdr_flags = 0;
    hdr->hdr_common.hdr_csum = 0;
    hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_RNDV;
    hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst;
    hdr->hdr_common.hdr_src = sendreq->req_endpoint->src;
    hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
    hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id;
    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
    hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence;
    hdr->hdr_match.hdr_csum = csum;
    hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0;
    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
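    /* hdr_csum was zeroed above, so the checksum computed here over the full
     * header is taken with the csum field cleared and then stored into it */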
    hdr->hdr_common.hdr_csum = (do_csum ?
                                opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t)) :
                                OPAL_CSUM_ZERO);

    /* re-init convertor for packed data (it keeps the flags) */
    opal_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor,
                                     &(ompi_mpi_byte.dt.super),
                                     sendreq->req_send.req_bytes_packed,
                                     sendreq->req_send.req_addr );

    /* request is complete at mpi level */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq);
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* send */
    MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0);
    rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML);
    if(OMPI_SUCCESS != rc) {
        mca_bml_base_free(bml_btl, descriptor );
    }
    return rc;
}


/**
 * BTL requires "specially" allocated memory. Request a segment that
 * is used for the initial hdr and any eager data.
 */

int mca_pml_dr_send_request_start_copy(
    mca_pml_dr_send_request_t* sendreq,
    mca_bml_base_btl_t* bml_btl,
    size_t size)
{
    mca_btl_base_descriptor_t* descriptor;
    mca_btl_base_segment_t* segment;
    mca_pml_dr_hdr_t* hdr;
    struct iovec iov;
    unsigned int iov_count;
    size_t max_data;
    int rc;
    bool do_csum = mca_pml_dr.enable_csum &&
        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* allocate descriptor */
    mca_bml_base_alloc(bml_btl, &descriptor,
                       MCA_BTL_NO_ORDER,
                       sizeof(mca_pml_dr_match_hdr_t) + size,
                       MCA_BTL_DES_FLAGS_PRIORITY);
    if(NULL == descriptor) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sendreq->req_descriptor = descriptor; /* hang on to this for later */
    segment = descriptor->des_src;

    /* pack the data into the supplied buffer */
    iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + sizeof(mca_pml_dr_match_hdr_t));
    iov.iov_len = size;
    iov_count = 1;
    max_data = size;
    if(size > 0) {
        if((rc = opal_convertor_pack( &sendreq->req_send.req_base.req_convertor,
                                      &iov,
                                      &iov_count,
                                      &max_data)) < 0) {
            mca_bml_base_free(bml_btl, descriptor);
            return rc;
        }
    }

    /* build match header */
    hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval;
    hdr->hdr_common.hdr_flags = 0;
    hdr->hdr_common.hdr_csum = 0;
    hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_MATCH;
    hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst;
    hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
    hdr->hdr_common.hdr_src = sendreq->req_endpoint->src;
    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
    hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence;
    hdr->hdr_match.hdr_csum = (size > 0 && do_csum ?
                               sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
    hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0;
    hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id;
    hdr->hdr_common.hdr_csum = (do_csum ?
                                opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t)) :
                                OPAL_CSUM_ZERO);

    /* vfrag status */
    sendreq->req_vfrag0.vf_size = max_data;
    sendreq->req_vfrag0.bml_btl = bml_btl;
    sendreq->req_vfrag0.vf_pending = 1;

    /* short message */
    descriptor->des_cbfunc = mca_pml_dr_match_completion;
    descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
    descriptor->des_cbdata = &sendreq->req_vfrag0;
    segment->seg_len = sizeof(mca_pml_dr_match_hdr_t) + max_data;

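    /* the user buffer has been fully copied into the descriptor, so the
     * request can be marked MPI-complete before the send is even initiated;
     * the PML-level ack/retransmit machinery still keeps the request alive
     * until delivery is confirmed */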
    /* signal request completion */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq);
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* send */
    MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0);
    rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML);
    if(OMPI_SUCCESS != rc) {
        mca_bml_base_free(bml_btl, descriptor );
    }
    return rc;
}

/**
 * BTL can send directly from user buffer so allow the BTL
 * to prepare the segment list.
 */

int mca_pml_dr_send_request_start_prepare(
    mca_pml_dr_send_request_t* sendreq,
    mca_bml_base_btl_t* bml_btl,
    size_t size)
{
    mca_btl_base_descriptor_t* descriptor;
    mca_btl_base_segment_t* segment;
    mca_pml_dr_hdr_t* hdr;
    int rc;
    bool do_csum = mca_pml_dr.enable_csum &&
        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* prepare descriptor */
    mca_bml_base_prepare_src( bml_btl,
                              NULL,
                              &sendreq->req_send.req_base.req_convertor,
                              MCA_BTL_NO_ORDER,
                              sizeof(mca_pml_dr_match_hdr_t),
                              &size,
                              MCA_BTL_DES_FLAGS_PRIORITY,
                              &descriptor );
    if(NULL == descriptor) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sendreq->req_descriptor = descriptor; /* hang on to this for later */
    segment = descriptor->des_src;

    /* build match header */
    hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval;
    hdr->hdr_common.hdr_flags = 0;
    hdr->hdr_common.hdr_csum = 0;
    hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_MATCH;
    hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst;
    hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
    hdr->hdr_common.hdr_src = sendreq->req_endpoint->src;
    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
    hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence;
    hdr->hdr_match.hdr_csum = (size > 0 && do_csum ?
                               sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
    hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0;
    hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id;
    hdr->hdr_common.hdr_csum = (do_csum ?
                                opal_csum(hdr, sizeof(mca_pml_dr_match_hdr_t)) :
                                OPAL_CSUM_ZERO);

    /* short message */
    descriptor->des_cbfunc = mca_pml_dr_match_completion;
    descriptor->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
    descriptor->des_cbdata = &sendreq->req_vfrag0;

    /* vfrag state */
    sendreq->req_vfrag0.vf_size = size;
    sendreq->req_vfrag0.bml_btl = bml_btl;
    sendreq->req_vfrag0.vf_pending = 1;

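    /* note: unlike the buffered/copy paths there is no MPI-level completion
     * here - the BTL may be sending straight from the user buffer, so the
     * request cannot complete until the ack confirms delivery */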
    /* send */
    MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0);
    rc = mca_bml_base_send(bml_btl, descriptor, MCA_BTL_TAG_PML);
    if(OMPI_SUCCESS != rc) {
        mca_bml_base_free(bml_btl, descriptor );
    }
    return rc;
}


/**
 * Rendezvous is required. Eager send up to
 * the BTL's eager limit.
 */

int mca_pml_dr_send_request_start_rndv(
    mca_pml_dr_send_request_t* sendreq,
    mca_bml_base_btl_t* bml_btl,
    size_t size,
    int flags)
{
    mca_btl_base_descriptor_t* des;
    mca_btl_base_segment_t* segment;
    mca_pml_dr_hdr_t* hdr;
    int rc;
    bool do_csum = mca_pml_dr.enable_csum &&
        (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

    /* prepare descriptor */
    if(size == 0) {
        mca_bml_base_alloc( bml_btl,
                            &des,
                            MCA_BTL_NO_ORDER,
                            sizeof(mca_pml_dr_rendezvous_hdr_t),
                            MCA_BTL_DES_FLAGS_PRIORITY );
    } else {
        mca_bml_base_prepare_src( bml_btl,
                                  NULL,
                                  &sendreq->req_send.req_base.req_convertor,
                                  MCA_BTL_NO_ORDER,
                                  sizeof(mca_pml_dr_rendezvous_hdr_t),
                                  &size,
                                  MCA_BTL_DES_FLAGS_PRIORITY,
                                  &des );
    }

    if(NULL == des) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sendreq->req_descriptor = des; /* hang on to this for later */
    segment = des->des_src;

    /* build hdr */
    hdr = (mca_pml_dr_hdr_t*)segment->seg_addr.pval;
    hdr->hdr_common.hdr_flags = flags;
    hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_RNDV;
    hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst;
    hdr->hdr_common.hdr_src = sendreq->req_endpoint->src;
    hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
    hdr->hdr_common.hdr_vid = sendreq->req_vfrag0.vf_id;
    hdr->hdr_common.hdr_csum = 0;
    hdr->hdr_match.hdr_tag = sendreq->req_send.req_base.req_tag;
    hdr->hdr_match.hdr_seq = sendreq->req_send.req_base.req_sequence;
    hdr->hdr_match.hdr_src_ptr.pval = &sendreq->req_vfrag0;
    hdr->hdr_match.hdr_csum = size > 0 ? sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO;
    hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
    hdr->hdr_common.hdr_csum = (do_csum ?
                                opal_csum(hdr, sizeof(mca_pml_dr_rendezvous_hdr_t)) :
                                OPAL_CSUM_ZERO);

    /* first fragment of a long message */
    des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
    des->des_cbdata = &sendreq->req_vfrag0;
    des->des_cbfunc = mca_pml_dr_rndv_completion;

    /* vfrag state */
    sendreq->req_vfrag0.vf_size = size;
    sendreq->req_vfrag0.bml_btl = bml_btl;
    sendreq->req_vfrag0.vf_state |= MCA_PML_DR_VFRAG_RNDV;
    sendreq->req_vfrag0.vf_pending = 1;

    /* send */
    MCA_PML_DR_VFRAG_WDOG_START(&sendreq->req_vfrag0);
    rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML);
    if(OMPI_SUCCESS != rc) {
        mca_bml_base_free(bml_btl, des );
    }
    return rc;
}

/**
 * Schedule pipeline of send descriptors for the given request,
 * using send protocol.
 */

int mca_pml_dr_send_request_schedule(mca_pml_dr_send_request_t* sendreq)
{
    /*
     * Only allow one thread in this routine for a given request.
     * However, we cannot block callers on a mutex, so simply keep track
     * of the number of times the routine has been called and run through
     * the scheduling logic once for every call.
     */

    mca_pml_dr_endpoint_t* endpoint = sendreq->req_endpoint;
    assert(sendreq->req_vfrag0.vf_recv.pval != NULL);
    if(OPAL_THREAD_ADD32(&sendreq->req_lock,1) == 1) {
        do {
            /* initialized here because the retransmission loop below also
             * consumes it; it is recomputed before new fragments are
             * scheduled */
            size_t bytes_remaining = sendreq->req_send.req_bytes_packed -
                sendreq->req_send_offset;
            /*
             * VFrags w/ nacks or that timed out
             */
            while(opal_list_get_size(&sendreq->req_retrans) &&
                  sendreq->req_pipeline_depth < mca_pml_dr.send_pipeline_depth) {
                mca_pml_dr_vfrag_t* vfrag;

                OPAL_THREAD_LOCK(&ompi_request_lock);
                vfrag = (mca_pml_dr_vfrag_t*)opal_list_get_first(&sendreq->req_retrans);
                OPAL_THREAD_UNLOCK(&ompi_request_lock);
                if(NULL == vfrag) {
                    break;
                }

                /*
                 * Retransmit fragments that have not been acked.
                 */
                while(vfrag->vf_idx < vfrag->vf_len &&
                      sendreq->req_pipeline_depth < mca_pml_dr.send_pipeline_depth) {
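                    /* resend only fragments whose ack bit is still clear:
                     * bit vf_idx of vf_ack is set once that fragment has
                     * been positively acknowledged */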
                    if(((uint64_t)1 << vfrag->vf_idx) & ~vfrag->vf_ack) {
                        mca_bml_base_btl_t* bml_btl = vfrag->bml_btl;
                        mca_pml_dr_frag_hdr_t* hdr;
                        mca_btl_base_descriptor_t* des;
                        size_t offset_in_vfrag = vfrag->vf_max_send_size * vfrag->vf_idx;
                        size_t offset_in_msg = vfrag->vf_offset + offset_in_vfrag;
                        size_t size;
                        int rc;
                        bool do_csum = mca_pml_dr.enable_csum &&
                            (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

                        if(vfrag->vf_idx == vfrag->vf_len - 1) {
                            size = vfrag->vf_size - offset_in_vfrag;
                        } else {
                            size = vfrag->vf_max_send_size;
                        }

                        /* pack into a descriptor */
                        opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, &offset_in_msg);
                        mca_bml_base_prepare_src( bml_btl,
                                                  NULL,
                                                  &sendreq->req_send.req_base.req_convertor,
                                                  MCA_BTL_NO_ORDER,
                                                  sizeof(mca_pml_dr_frag_hdr_t),
                                                  &size,
                                                  0,
                                                  &des );
                        if(des == NULL) {
                            OPAL_THREAD_LOCK(&ompi_request_lock);
                            opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq);
                            opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq);
                            OPAL_THREAD_UNLOCK(&ompi_request_lock);
                            break;
                        }
                        des->des_cbfunc = mca_pml_dr_frag_completion;
                        des->des_cbdata = vfrag;

                        /* setup header */
                        hdr = (mca_pml_dr_frag_hdr_t*)des->des_src->seg_addr.pval;
                        hdr->hdr_common.hdr_flags = 0;
                        hdr->hdr_common.hdr_csum = 0;
                        hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_FRAG;
                        hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst;
                        hdr->hdr_common.hdr_vid = vfrag->vf_id;
                        hdr->hdr_common.hdr_src = sendreq->req_endpoint->src;
                        hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
                        hdr->hdr_vlen = vfrag->vf_len;
                        hdr->hdr_frag_idx = vfrag->vf_idx;
                        hdr->hdr_frag_csum = sendreq->req_send.req_base.req_convertor.checksum;
                        hdr->hdr_frag_offset = offset_in_msg;
                        hdr->hdr_src_ptr.pval = vfrag;
                        hdr->hdr_dst_ptr = sendreq->req_vfrag0.vf_recv;
                        hdr->hdr_common.hdr_csum = (do_csum ?
                                                    opal_csum(hdr, sizeof(mca_pml_dr_frag_hdr_t)) :
                                                    OPAL_CSUM_ZERO);

                        /* adjust number of outstanding operations */
                        if(OPAL_THREAD_ADD64(&vfrag->vf_pending, 1) == 1) {
                            MCA_PML_DR_VFRAG_WDOG_START(vfrag);
                        }
                        OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,1);

                        /* adjust send offset - may not have finished scheduling entire vfrag */
                        if(offset_in_msg + size > sendreq->req_send_offset) {
                            sendreq->req_send_offset = offset_in_msg + size;
                        }

                        /* initiate send - note that this may complete before the call returns */
                        rc = mca_bml_base_send( bml_btl, des, MCA_BTL_TAG_PML);

                        if(rc == OMPI_SUCCESS) {
                            bytes_remaining -= size;
                        } else {
                            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1);
                            mca_bml_base_free(bml_btl,des);
                            OPAL_THREAD_LOCK(&ompi_request_lock);
                            opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*) sendreq);
                            opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq);
                            OPAL_THREAD_UNLOCK(&ompi_request_lock);
                            break;
                        }
                    }
                    vfrag->vf_idx++;
                }

                /* remove from retrans list */
                if(vfrag->vf_idx == vfrag->vf_len) {
                    OPAL_THREAD_LOCK(&ompi_request_lock);
                    opal_list_remove_item(&sendreq->req_retrans, (opal_list_item_t*)vfrag);
                    vfrag->vf_state &= ~MCA_PML_DR_VFRAG_RETRANS;
                    OPAL_THREAD_UNLOCK(&ompi_request_lock);
                }
            }

            /* allocate remaining bytes to BTLs */
            bytes_remaining = sendreq->req_send.req_bytes_packed - sendreq->req_send_offset;
            while(bytes_remaining > 0 &&
                  sendreq->req_pipeline_depth < mca_pml_dr.send_pipeline_depth) {

                mca_pml_dr_frag_hdr_t* hdr;
                mca_btl_base_descriptor_t* des;
                mca_bml_base_btl_t* bml_btl = NULL;
                mca_pml_dr_vfrag_t* vfrag = sendreq->req_vfrag;
                size_t size = bytes_remaining;

                /* offset tells us how much of the vfrag has been scheduled */
                size_t bytes_sent = sendreq->req_send_offset - vfrag->vf_offset;
                int rc;
                bool do_csum;

                /* do we need to allocate a new vfrag?
                   (the current vfrag has been scheduled in full) */
                if(vfrag->vf_size == bytes_sent) {
                    bml_btl = mca_bml_base_btl_array_get_next(&endpoint->bml_endpoint->btl_send);
                    MCA_PML_DR_VFRAG_ALLOC(vfrag,rc);
                    if(NULL == vfrag) {
                        OPAL_THREAD_LOCK(&mca_pml_dr.lock);
                        opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq);
                        opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq);
                        OPAL_THREAD_UNLOCK(&mca_pml_dr.lock);
                        break;
                    }
                    MCA_PML_DR_SEND_REQUEST_VFRAG_INIT(sendreq,endpoint,bytes_remaining,vfrag);
                    vfrag->bml_btl = bml_btl;
                    bytes_sent = 0;

                } else { /* always schedule the vfrag across the same btl */
                    bml_btl = vfrag->bml_btl;
                }

                do_csum = mca_pml_dr.enable_csum &&
                    (bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);

                /* make sure that we don't exceed the vfrag size */
                if (size > vfrag->vf_max_send_size) {
                    size = vfrag->vf_max_send_size;
                }
                if (size > vfrag->vf_size - bytes_sent) {
                    size = vfrag->vf_size - bytes_sent;
                }

                /* pack into a descriptor */
                opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, &sendreq->req_send_offset);
                mca_bml_base_prepare_src( bml_btl,
                                          NULL,
                                          &sendreq->req_send.req_base.req_convertor,
                                          MCA_BTL_NO_ORDER,
                                          sizeof(mca_pml_dr_frag_hdr_t),
                                          &size,
                                          0,
                                          &des );
                if(des == NULL) {
                    OPAL_THREAD_LOCK(&mca_pml_dr.lock);
                    opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq);
                    opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq);
                    OPAL_THREAD_UNLOCK(&mca_pml_dr.lock);
                    break;
                }
                des->des_cbfunc = mca_pml_dr_frag_completion;
                des->des_cbdata = vfrag;

                /* setup header */
                hdr = (mca_pml_dr_frag_hdr_t*)des->des_src->seg_addr.pval;
                hdr->hdr_common.hdr_flags = 0;
                hdr->hdr_common.hdr_csum = 0;
                hdr->hdr_common.hdr_type = MCA_PML_DR_HDR_TYPE_FRAG;
                hdr->hdr_common.hdr_dst = sendreq->req_endpoint->dst;
                hdr->hdr_common.hdr_vid = vfrag->vf_id;
                hdr->hdr_common.hdr_src = sendreq->req_endpoint->src;
                hdr->hdr_common.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
                hdr->hdr_vlen = vfrag->vf_len;
                hdr->hdr_frag_idx = vfrag->vf_idx;
                hdr->hdr_frag_csum = (do_csum ?
                                      sendreq->req_send.req_base.req_convertor.checksum : OPAL_CSUM_ZERO);
                hdr->hdr_frag_offset = sendreq->req_send_offset;
                hdr->hdr_src_ptr.pval = vfrag;
                hdr->hdr_dst_ptr = sendreq->req_vfrag0.vf_recv;
                hdr->hdr_common.hdr_csum = (do_csum ?
                                            opal_csum(hdr, sizeof(mca_pml_dr_frag_hdr_t)): OPAL_CSUM_ZERO);

                assert(hdr->hdr_frag_offset < sendreq->req_send.req_bytes_packed);

                /* update state */
                vfrag->vf_idx++;
                if(OPAL_THREAD_ADD64(&vfrag->vf_pending,1) == 1) {
                    MCA_PML_DR_VFRAG_WDOG_START(vfrag);
                }
                sendreq->req_send_offset += size;
                OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,1);

                /* initiate send - note that this may complete before the call returns */
                rc = mca_bml_base_send( bml_btl, des, MCA_BTL_TAG_PML);

                if(rc == OMPI_SUCCESS) {
                    bytes_remaining -= size;
                } else {
                    sendreq->req_send_offset -= size;
                    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-1);
                    mca_bml_base_free(bml_btl,des);
                    OPAL_THREAD_LOCK(&ompi_request_lock);
                    opal_list_remove_item(&mca_pml_dr.send_active, (opal_list_item_t*)sendreq);
                    opal_list_append(&mca_pml_dr.send_pending, (opal_list_item_t*)sendreq);
                    OPAL_THREAD_UNLOCK(&ompi_request_lock);
                    break;
                }
            }
        } while (OPAL_THREAD_ADD32(&sendreq->req_lock,-1) > 0);
    }
    return OMPI_SUCCESS;
}


/**
 * Acknowledgment of match vfrag.
 */

void mca_pml_dr_send_request_match_ack(
    mca_btl_base_module_t* btl,
    mca_pml_dr_ack_hdr_t* ack)
{
    mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)ack->hdr_src_ptr.pval;
    mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;

    OPAL_THREAD_LOCK(&ompi_request_lock);

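    /* the ack can race with local completion of the send: if vf_pending has
     * already dropped to zero the completion callback has run, so the request
     * is finished (or retried) here; otherwise only the ack state is recorded
     * and the completion callback takes the appropriate action later */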
    vfrag->vf_ack = ack->hdr_vmask & vfrag->vf_mask;
    if (vfrag->vf_pending == 0) {
        MCA_PML_DR_VFRAG_ACK_STOP(vfrag);
        /* need to retransmit? */
        if(vfrag->vf_ack != vfrag->vf_mask) {
            MCA_PML_DR_VFRAG_WDOG_START(vfrag);
            MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag);
        } else {
            /* if already have local completion free descriptor and complete message */
            /* return descriptor */
            if(NULL != sendreq->req_descriptor) {
                if(NULL != sendreq->req_descriptor->des_context) {
                    mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor );
                }
                sendreq->req_descriptor = NULL;
            }

            /* update statistics */
            sendreq->req_bytes_delivered = vfrag->vf_size;
            MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n",
                                 __FILE__, __LINE__, vfrag->vf_id));
            ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id);
            MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq);
        }

    /* wait for local completion */
    } else {
        /* need to retransmit? */
        if (vfrag->vf_ack != vfrag->vf_mask) {
            vfrag->vf_state |= MCA_PML_DR_VFRAG_NACKED;
        } else {
            vfrag->vf_recv = ack->hdr_dst_ptr;
        }
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);
}

/**
 * Acknowledgment of rendezvous vfrag.
 */

void mca_pml_dr_send_request_rndv_ack(
    mca_btl_base_module_t* btl,
    mca_pml_dr_ack_hdr_t* ack)
{
    mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)ack->hdr_src_ptr.pval;
    mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;

    OPAL_THREAD_LOCK(&ompi_request_lock);

    /* set acked bits */
    vfrag->vf_ack = ack->hdr_vmask & vfrag->vf_mask;

    /* local completion? */
    if (vfrag->vf_pending == 0) {
        bool schedule = false;
        MCA_PML_DR_VFRAG_ACK_STOP(vfrag);

        /* need to retransmit? */
        if(vfrag->vf_ack != vfrag->vf_mask) {
            /* got a NACK, resend eager data! */
            MCA_PML_DR_VFRAG_WDOG_START(vfrag);
            MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag);
            OPAL_THREAD_UNLOCK(&ompi_request_lock);
        } else {
            /* return descriptor of first fragment */
            if(NULL != sendreq->req_descriptor) {
                if(NULL != sendreq->req_descriptor->des_context) {
                    mca_bml_base_free((mca_bml_base_btl_t*)sendreq->req_descriptor->des_context, sendreq->req_descriptor);
                }
                sendreq->req_descriptor = NULL;
            }

            /* done? */
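            /* hdr_vlen appears to be overloaded on the rendezvous ack to
             * carry the number of eager bytes the receiver accepted, so it
             * seeds both the send offset and the delivered byte count */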
            sendreq->req_send_offset = ack->hdr_vlen;
            sendreq->req_bytes_delivered = ack->hdr_vlen;
            if(sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) {
                MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq);
            } else {
                /* start scheduling with a new vfrag */
                vfrag->vf_recv = ack->hdr_dst_ptr;
                vfrag->vf_state = 0;
                vfrag->vf_size = ack->hdr_vlen;
                schedule = true;
            }

            /* stash the vfrag id to drop duplicate acks */
            MCA_PML_DR_DEBUG(1, (0, "%s:%d: inserting vid %d into sequence tracker\n",
                                 __FILE__, __LINE__, vfrag->vf_id));

            ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id);
            OPAL_THREAD_UNLOCK(&ompi_request_lock);

            if(schedule) {
                mca_pml_dr_send_request_schedule(sendreq);
            }
        }

    /* wait for local completion */
    } else {
        /* need to retransmit? */
        if(vfrag->vf_ack != vfrag->vf_mask) {
            vfrag->vf_state |= MCA_PML_DR_VFRAG_NACKED;
        } else {
            /* will need this to schedule rest of the message */
            vfrag->vf_recv = ack->hdr_dst_ptr;
            vfrag->vf_state = 0;
            vfrag->vf_size = ack->hdr_vlen;
            sendreq->req_send_offset = ack->hdr_vlen;
            sendreq->req_bytes_delivered = ack->hdr_vlen;
        }
        OPAL_THREAD_UNLOCK(&ompi_request_lock);
    }
}

/**
 * Acknowledgment of a fragment vfrag.
 */

void mca_pml_dr_send_request_frag_ack(
    mca_btl_base_module_t* btl,
    mca_pml_dr_ack_hdr_t* ack)
{
    mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*)ack->hdr_src_ptr.pval;
    mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;
    bool schedule = false;

    MCA_PML_DR_VFRAG_ACK_STOP(vfrag);
    OPAL_THREAD_LOCK(&ompi_request_lock);

    /* add in acknowledged fragments */
    vfrag->vf_ack |= (ack->hdr_vmask & vfrag->vf_mask);

    /* need to retransmit? */
    if(vfrag->vf_ack != vfrag->vf_mask) {
        MCA_PML_DR_DEBUG(0,(0, "got a vfrag NACK, retransmitting %llx\n", (long long)~vfrag->vf_ack));
        MCA_PML_DR_SEND_REQUEST_VFRAG_RETRANS(sendreq,vfrag);
        schedule = true;

    /* acked and local completion */
    } else if (vfrag->vf_pending == 0 && vfrag->vf_idx == vfrag->vf_len) {

        /* update statistics */
        sendreq->req_bytes_delivered += vfrag->vf_size;
        assert(sendreq->req_bytes_delivered <= sendreq->req_send.req_bytes_packed);

        /* stash the vfrag id to drop duplicate acks */
        MCA_PML_DR_DEBUG(1,(0, "%s:%d: inserting vid %d into sequence tracker\n",
                            __FILE__, __LINE__, vfrag->vf_id));

        ompi_seq_tracker_insert(&sendreq->req_endpoint->seq_sends, vfrag->vf_id);
        /* return vfrag */
        MCA_PML_DR_VFRAG_RETURN(vfrag);

        /* are we done with this request? */
        if(sendreq->req_bytes_delivered == sendreq->req_send.req_bytes_packed) {
            MCA_PML_DR_SEND_REQUEST_PML_COMPLETE(sendreq);
        /* is there something left to schedule? */
        } else if (sendreq->req_send_offset < sendreq->req_send.req_bytes_packed) {
            schedule = true;
        }
    }
    OPAL_THREAD_UNLOCK(&ompi_request_lock);
    if(schedule) {
        mca_pml_dr_send_request_schedule(sendreq);
    }
}


void mca_pml_dr_sendreq_cleanup_active(mca_btl_base_module_t* btl) {
    opal_list_item_t* item;

    for (item = opal_list_get_first(&mca_pml_dr.send_active) ;
         item != opal_list_get_end(&mca_pml_dr.send_active) ;
         item = opal_list_get_next(item)) {
        mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*) item;
        mca_btl_base_descriptor_t* des = sendreq->req_descriptor;
        /* the des may be NULL if the first fragment of the rndv
         * has already been delivered */
        if (NULL != des) {
            mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
            if( bml_btl && bml_btl->btl == btl) {
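                /* presumably the reference is dropped so that the
                 * descriptor-free paths in this file, which test des_context
                 * for NULL before calling mca_bml_base_free(), will skip this
                 * descriptor instead of freeing it through a BTL that is gone */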
                des->des_context = NULL;
            }
        }
    }
}