2188 строки
107 KiB
C
2188 строки
107 KiB
C
|
/*
|
||
|
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||
|
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||
|
* All rights reserved.
|
||
|
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
||
|
* Copyright (c) 2014 Research Organization for Information Science
|
||
|
* and Technology (RIST). All rights reserved.
|
||
|
* $COPYRIGHT$
|
||
|
*
|
||
|
* Additional copyrights may follow
|
||
|
*
|
||
|
* $HEADER$
|
||
|
*/
|
||
|
|
||
|
/**
|
||
|
* @file
|
||
|
* Functions that implement failover capabilities. To utilize the
|
||
|
* failover feature, one needs to configure the library with
|
||
|
* --enable-openib-failover. Then the system that is being used
|
||
|
* must have two or more openib BTLs in use. When an error occurs,
|
||
|
* the BTL will call into this PML to map out the offending BTL and
|
||
|
* continue using the one that is still working.
|
||
|
* Most of the differences between the ob1 PML and the bfo PML are
|
||
|
* contained in this file.
|
||
|
*/
|
||
|
|
||
|
#include "ompi_config.h"
|
||
|
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#include "opal/class/opal_bitmap.h"
|
||
|
#include "opal/mca/btl/btl.h"
|
||
|
#include "opal/mca/btl/base/base.h"
|
||
|
#include "ompi/mca/pml/pml.h"
|
||
|
#include "ompi/mca/pml/base/base.h"
|
||
|
#include "ompi/mca/pml/base/base.h"
|
||
|
#include "pml_bfo.h"
|
||
|
#include "pml_bfo_component.h"
|
||
|
#include "pml_bfo_comm.h"
|
||
|
#include "pml_bfo_hdr.h"
|
||
|
#include "pml_bfo_recvfrag.h"
|
||
|
#include "pml_bfo_sendreq.h"
|
||
|
#include "pml_bfo_recvreq.h"
|
||
|
#include "pml_bfo_rdmafrag.h"
|
||
|
#include "pml_bfo_failover.h"
|
||
|
#include "ompi/mca/bml/base/base.h"
|
||
|
|
||
|
#include "ompi/runtime/ompi_cr.h"
|
||
|
|
||
|
static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl,
|
||
|
mca_bml_base_endpoint_t* ep);
|
||
|
|
||
|
/**
|
||
|
* When running with failover enabled, check the PML sequence numbers
|
||
|
* to see if we have received a duplicate message. This check is done
|
||
|
* for for all MATCH fragments. It is also done for RNDV and RGET
|
||
|
* fragments that do not have the MCA_PML_BFO_HDR_FLAGS_RESTART flag
|
||
|
* set.
|
||
|
* We set the window size to half the total range of sequence numbers.
|
||
|
* We only enter this code when the seq_num is not the expected one.
|
||
|
* A few more notes on the algorithm used here. In normal operation,
|
||
|
* the expected value will either be equal to or less than the
|
||
|
* sequence number of the header. This is because we are using this
|
||
|
* sequence number to detect packets arriving prior to them being
|
||
|
* expected. If we determine that expected is less than header, then
|
||
|
* make sure this is not a rollover case. We do that by adding the
|
||
|
* maxnum to the expected.
|
||
|
* @param proc Pointer to proc from where message came
|
||
|
* @param hdr Pointer to header of message
|
||
|
*/
|
||
|
bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc,
|
||
|
mca_pml_bfo_match_hdr_t *hdr)
|
||
|
{
|
||
|
const int window = 32768;
|
||
|
const int maxnum = 65536;
|
||
|
mca_pml_bfo_recv_frag_t *frag;
|
||
|
|
||
|
#if 0
|
||
|
opal_output(0, "checking dup, exp=%d, act=%d, type=%d, cant_match=%d\n",
|
||
|
(uint16_t)proc->expected_sequence,
|
||
|
hdr->hdr_seq, hdr->hdr_common.hdr_type,
|
||
|
opal_list_get_size(&proc->frags_cant_match));
|
||
|
#endif
|
||
|
|
||
|
/* Few cases near end of values where expected may equal 65535 and
|
||
|
* an out of order shows up that may equal something like 1. */
|
||
|
if (OPAL_UNLIKELY((uint16_t)proc->expected_sequence > hdr->hdr_seq)) {
|
||
|
if (((uint16_t)proc->expected_sequence - hdr->hdr_seq) < window) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n",
|
||
|
__FILE__, __LINE__, (uint16_t)proc->expected_sequence,
|
||
|
hdr->hdr_seq, hdr->hdr_common.hdr_type);
|
||
|
return true;
|
||
|
}
|
||
|
} else {
|
||
|
/* This is the normal flow through this code. We also need to
|
||
|
* use the maxnum to ensure that we handle cases where the
|
||
|
* expected number has rolled over but then a duplicate message
|
||
|
* shows up that is greater than it. */
|
||
|
if ((((uint16_t)proc->expected_sequence + maxnum) - hdr->hdr_seq) < window) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n",
|
||
|
__FILE__, __LINE__, (uint16_t)proc->expected_sequence,
|
||
|
hdr->hdr_seq, hdr->hdr_common.hdr_type);
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Need to explicitly check against any out of order fragments. Unfortunately, we
|
||
|
* always have to do this since we can get a duplicate out of order fragment. */
|
||
|
if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) {
|
||
|
for(frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match);
|
||
|
frag != (mca_pml_bfo_recv_frag_t*)opal_list_get_end(&proc->frags_cant_match);
|
||
|
frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_next(frag))
|
||
|
{
|
||
|
mca_pml_bfo_match_hdr_t* mhdr = &frag->hdr.hdr_match;
|
||
|
|
||
|
if(mhdr->hdr_seq == hdr->hdr_seq) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"%s:%d: frag duplicated on frags_cant_match list, seq=%d, type=%d\n",
|
||
|
__FILE__, __LINE__, hdr->hdr_seq, hdr->hdr_common.hdr_type);
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This function checks to see if we have received a duplicate FIN
|
||
|
* message. This is done by first pulling the pointer of the request
|
||
|
* that the FIN message is pointing to from the message. We then
|
||
|
* check the various fields in the request to the fields in the header
|
||
|
* and make sure they match. If they do not, then the request must
|
||
|
* have been recycled already and this is a duplicate FIN message. We
|
||
|
* have to do this check on every FIN message that we receive.
|
||
|
*/
|
||
|
bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma,
|
||
|
mca_btl_base_module_t* btl)
|
||
|
{
|
||
|
mca_pml_base_request_t* basereq;
|
||
|
/* When running with failover enabled, need to ensure that this
|
||
|
* is not a duplicate FIN message. */
|
||
|
if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) {
|
||
|
/* The first check is to make sure the descriptor is pointing
|
||
|
* to a valid request. The descriptor may be pointing to NULL
|
||
|
* if it was freed and not reused yet. */
|
||
|
if (NULL == rdma->des_cbdata) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"FIN: received: dropping because not pointing to valid descriptor "
|
||
|
"PML=%d CTX=%d SRC=%d RQS=%d",
|
||
|
hdr->hdr_fin.hdr_match.hdr_seq,
|
||
|
hdr->hdr_fin.hdr_match.hdr_ctx,
|
||
|
hdr->hdr_fin.hdr_match.hdr_src,
|
||
|
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
basereq = (mca_pml_base_request_t*)rdma->des_cbdata;
|
||
|
/* Now we know the descriptor is pointing to a non-null request.
|
||
|
* Does it match what we expect? To make sure the receiver request
|
||
|
* matches the FIN message, check the context number, source of the
|
||
|
* message, and MPI sequence number. Then make sure that it also
|
||
|
* matches the internal sequencing number of the requests. We need
|
||
|
* to look at the type of request we are pointing at to figure out
|
||
|
* what fields to access. */
|
||
|
if (basereq->req_type == MCA_PML_REQUEST_RECV) {
|
||
|
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)basereq;
|
||
|
if ((hdr->hdr_fin.hdr_match.hdr_ctx !=
|
||
|
recvreq->req_recv.req_base.req_comm->c_contextid) ||
|
||
|
(hdr->hdr_fin.hdr_match.hdr_src !=
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) ||
|
||
|
(hdr->hdr_fin.hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
|
||
|
opal_output_verbose(5, mca_pml_bfo_output,
|
||
|
"FIN: received on receiver: dropping because no match "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
|
||
|
"RQS:exp=%d,act=%d, dst_req=%p",
|
||
|
(uint16_t)recvreq->req_msgseq, hdr->hdr_fin.hdr_match.hdr_seq,
|
||
|
recvreq->req_recv.req_base.req_comm->c_contextid,
|
||
|
hdr->hdr_fin.hdr_match.hdr_ctx,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||
|
hdr->hdr_fin.hdr_match.hdr_src,
|
||
|
recvreq->req_restartseq,
|
||
|
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
|
||
|
(void *)recvreq);
|
||
|
return true;
|
||
|
}
|
||
|
if (hdr->hdr_fin.hdr_match.hdr_common.hdr_flags != recvreq->req_restartseq) {
|
||
|
opal_output_verbose(5, mca_pml_bfo_output,
|
||
|
"FIN: received on receiver: dropping because old "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
|
||
|
"RQS:exp=%d,act=%d, dst_req=%p",
|
||
|
(uint16_t)recvreq->req_msgseq, hdr->hdr_fin.hdr_match.hdr_seq,
|
||
|
recvreq->req_recv.req_base.req_comm->c_contextid,
|
||
|
hdr->hdr_fin.hdr_match.hdr_ctx,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||
|
hdr->hdr_fin.hdr_match.hdr_src,
|
||
|
recvreq->req_restartseq,
|
||
|
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
|
||
|
(void *)recvreq);
|
||
|
return true;
|
||
|
}
|
||
|
} else if (basereq->req_type == MCA_PML_REQUEST_SEND) {
|
||
|
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)basereq;
|
||
|
if ((hdr->hdr_fin.hdr_match.hdr_ctx !=
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid) ||
|
||
|
(hdr->hdr_fin.hdr_match.hdr_src !=
|
||
|
sendreq->req_send.req_base.req_peer) ||
|
||
|
(hdr->hdr_fin.hdr_match.hdr_seq !=
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence)) {
|
||
|
uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||
|
opal_output_verbose(5, mca_pml_bfo_output,
|
||
|
"FIN: received on sender: dropping because no match "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
|
||
|
"RQS:exp=%d,act=%d, dst_req=%p",
|
||
|
seq, hdr->hdr_fin.hdr_match.hdr_seq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
hdr->hdr_fin.hdr_match.hdr_ctx,
|
||
|
sendreq->req_send.req_base.req_peer,
|
||
|
hdr->hdr_fin.hdr_match.hdr_src,
|
||
|
sendreq->req_restartseq,
|
||
|
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
|
||
|
(void *)sendreq);
|
||
|
return true;
|
||
|
}
|
||
|
if (hdr->hdr_fin.hdr_match.hdr_common.hdr_flags != sendreq->req_restartseq) {
|
||
|
uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||
|
opal_output_verbose(5, mca_pml_bfo_output,
|
||
|
"FIN: received on sender: dropping because old "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
|
||
|
"RQS:exp=%d,act=%d, dst_req=%p",
|
||
|
seq, hdr->hdr_fin.hdr_match.hdr_seq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
hdr->hdr_fin.hdr_match.hdr_ctx,
|
||
|
sendreq->req_send.req_base.req_peer,
|
||
|
hdr->hdr_fin.hdr_match.hdr_src,
|
||
|
sendreq->req_restartseq,
|
||
|
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
|
||
|
(void *)sendreq);
|
||
|
return true;
|
||
|
}
|
||
|
} else {
|
||
|
/* We can get here if the descriptor has been reused, but
|
||
|
* not as an RDMA descriptor. In that case, the callback
|
||
|
* function has been set to something else. Clearly the
|
||
|
* descriptor we are interested is gone, so just drop the
|
||
|
* FIN message. */
|
||
|
opal_output_verbose(5, mca_pml_bfo_output,
|
||
|
"FIN: received: dropping because descriptor has been reused "
|
||
|
"PML=%d CTX=%d SRC=%d RQS=%d rdma->des_flags=%d",
|
||
|
hdr->hdr_fin.hdr_match.hdr_seq, hdr->hdr_fin.hdr_match.hdr_ctx,
|
||
|
hdr->hdr_fin.hdr_match.hdr_src, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
|
||
|
rdma->des_flags);
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Repost a FIN message if we get an error on the completion event.
|
||
|
*/
|
||
|
void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) {
|
||
|
/* In the error case, we will repost the FIN message. I had
|
||
|
* considered restarting the request. The problem is that the
|
||
|
* request may be already complete when we detect that a FIN
|
||
|
* message got an error on its completion event. For example, with
|
||
|
* the PUT protocol, if the RDMA writes succeed and all the data
|
||
|
* has been sent, then the request is marked as complete and can be
|
||
|
* freed. Therefore, an error on the FIN message has no request to
|
||
|
* refer back to. So, we will just repost it. However, we are also
|
||
|
* faced with the case where the FIN message has an error but it
|
||
|
* actually makes it to the other side. In that case we are now
|
||
|
* sending a FIN message to a non-existent request on the receiver
|
||
|
* side. To handle that, we have added the match information to
|
||
|
* the FIN message. That way, we can check on the receiving side
|
||
|
* to ensure that it is pointing to a valid request. */
|
||
|
mca_pml_bfo_fin_hdr_t* hdr;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint;
|
||
|
ompi_proc_t *proc;
|
||
|
mca_bml_base_btl_t* bml_btl;
|
||
|
|
||
|
proc = (ompi_proc_t*) des->des_cbdata;
|
||
|
bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
hdr = (mca_pml_bfo_fin_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d",
|
||
|
hdr->hdr_match.hdr_seq, OMPI_PROC_MY_NAME->vpid, OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid,
|
||
|
hdr->hdr_fail, hdr->hdr_match.hdr_src);
|
||
|
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
|
||
|
/* Reconstruct the fin for sending on the other BTL */
|
||
|
mca_pml_bfo_send_fin(proc, bml_btl,
|
||
|
hdr->hdr_des, MCA_BTL_NO_ORDER,
|
||
|
hdr->hdr_fail, hdr->hdr_match.hdr_seq,
|
||
|
hdr->hdr_match.hdr_common.hdr_flags,
|
||
|
hdr->hdr_match.hdr_ctx, hdr->hdr_match.hdr_src);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This function is called when a RNDV or RGET is received with the
|
||
|
* FLAGS_RESTART flag set. This means this message already has a
|
||
|
* receive request already associated with it.
|
||
|
*/
|
||
|
mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr) {
|
||
|
mca_pml_bfo_recv_request_t *match = NULL;
|
||
|
mca_pml_bfo_rendezvous_hdr_t * rhdr = (mca_pml_bfo_rendezvous_hdr_t *) hdr;
|
||
|
match = (mca_pml_bfo_recv_request_t *) rhdr->hdr_dst_req.pval;
|
||
|
|
||
|
/* Check to see if we have received a duplicate RNDV (or RGET). This can
|
||
|
* occur because we got an error when we reposted the RNDV. Therefore,
|
||
|
* we make sure that the request has not completed from underneath us
|
||
|
* and been recycled. Secondly, make sure we are not getting it a
|
||
|
* second time for the same request. */
|
||
|
if ((rhdr->hdr_match.hdr_ctx != match->req_recv.req_base.req_comm->c_contextid) ||
|
||
|
(rhdr->hdr_match.hdr_src != match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) ||
|
||
|
(rhdr->hdr_match.hdr_seq != (uint16_t)match->req_msgseq) ||
|
||
|
(rhdr->hdr_restartseq == match->req_restartseq)) {
|
||
|
if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RNDV: received with RESTART flag: duplicate, dropping "
|
||
|
"PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq,
|
||
|
match->remote_req_send.pval, (void *)match,
|
||
|
match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
} else {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RGET: received with RESTART flag: duplicate, dropping "
|
||
|
"PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq,
|
||
|
match->remote_req_send.pval, (void *)match,
|
||
|
match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
}
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
mca_pml_bfo_recv_request_reset(match);
|
||
|
if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDV: received with RESTART flag: restarting recv, "
|
||
|
"PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq,
|
||
|
match->remote_req_send.pval, (void *)match,
|
||
|
match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
} else {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RGET: received with RESTART flag: restarting recv, "
|
||
|
"PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq,
|
||
|
match->remote_req_send.pval, (void *)match,
|
||
|
match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
}
|
||
|
return match;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Callback for when a RNDVRESTARTNOTIFY message is received. A
|
||
|
* RNDVRESTARTNOTIFY message is sent from the sender to the receiver
|
||
|
* telling the receiver that the message is going to be started over.
|
||
|
* The receiver first makes sure that the request being pointed to is
|
||
|
* still valid. If it is not, that means the receiver must have
|
||
|
* completed the request and therefore we need to send a NACK back to
|
||
|
* the sender. The receiver then makes sure this is not a duplicate
|
||
|
* message. If it is a duplicate, it will just drop it. Otherwise,
|
||
|
* it will then send a RNDVRESTARTACK message if there are no
|
||
|
* outstanding events on the receiver. Otherwise, it will just change
|
||
|
* the state of the request and wait for another event to send the
|
||
|
* RNDVRESTARTACK to the sender.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl,
|
||
|
mca_btl_base_tag_t tag,
|
||
|
mca_btl_base_descriptor_t* des,
|
||
|
void* cbdata ) {
|
||
|
mca_btl_base_segment_t* segments = des->des_local;
|
||
|
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||
|
mca_pml_bfo_recv_request_t* recvreq;
|
||
|
ompi_proc_t* ompi_proc;
|
||
|
ompi_process_name_t orte_proc;
|
||
|
|
||
|
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY);
|
||
|
recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_restart.hdr_dst_req.pval;
|
||
|
|
||
|
/* Check to see if the receive request is still valid. If the
|
||
|
* request is recycled, that means the original request must have
|
||
|
* completed and we therefore need to send a NACK back to the sender.
|
||
|
* Note that when the request is gone, we need to pull some information
|
||
|
* off the header so that we can figure out where to send the NACK
|
||
|
* message back to. */
|
||
|
if ((hdr->hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) ||
|
||
|
(hdr->hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) ||
|
||
|
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
|
||
|
orte_proc.jobid = hdr->hdr_restart.hdr_jobid;
|
||
|
orte_proc.vpid = hdr->hdr_restart.hdr_vpid;
|
||
|
|
||
|
ompi_proc = ompi_proc_find(&orte_proc);
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNOTIFY: received: does not match request, sending NACK back "
|
||
|
"PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
|
||
|
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, "
|
||
|
"hdr->hdr_vpid=%d, proc_hostname=%s",
|
||
|
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
|
||
|
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||
|
hdr->hdr_match.hdr_src, recvreq->req_restartseq,
|
||
|
hdr->hdr_restart.hdr_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||
|
hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid,
|
||
|
(NULL == ompi_proc->super.proc_hostname) ? "unknown" : ompi_proc->super.proc_hostname);
|
||
|
mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/* We know that we have the correct receive request. Make sure this is not
|
||
|
* a duplicate RNDVRESTARTNOTIFY on this request. */
|
||
|
if (hdr->hdr_restart.hdr_restartseq == recvreq->req_restartseq) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNOTIFY: received duplicate: dropping RNDVRESTARTNOTIFY "
|
||
|
"message PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d "
|
||
|
"RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
|
||
|
recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||
|
hdr->hdr_match.hdr_src, recvreq->req_restartseq,
|
||
|
hdr->hdr_restart.hdr_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/* Increment restart number. */
|
||
|
recvreq->req_restartseq++;
|
||
|
recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED;
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNOTIFY: received: outstanding receive events=%d, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
|
||
|
if (0 == recvreq->req_events) {
|
||
|
mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
|
||
|
OMPI_SUCCESS, btl);
|
||
|
}
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Callback for when a RNDVRESTARTACK message is received. This
|
||
|
* message is sent from the receiver to the sender to acknowledge
|
||
|
* the receipt of the RNDVRESTARTNOTIFY message. At this point,
|
||
|
* the sender can reset the send request and restart the message.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t* btl,
|
||
|
mca_btl_base_tag_t tag,
|
||
|
mca_btl_base_descriptor_t* des,
|
||
|
void* cbdata ) {
|
||
|
mca_btl_base_segment_t* segments = des->des_local;
|
||
|
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||
|
mca_pml_bfo_send_request_t* sendreq;
|
||
|
|
||
|
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK);
|
||
|
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;
|
||
|
|
||
|
/* Check to see if we have received a duplicate message. The
|
||
|
* first three comparisons make sure that we are not looking at a
|
||
|
* recycled request. The last check makes sure we are not getting
|
||
|
* a duplicate message for this specific request. All of this is
|
||
|
* needed because the receiver might get an error and repost the
|
||
|
* RNDVRESTARTACK message, but the RNDVRESTARTACK was actually received. */
|
||
|
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
|
||
|
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
|
||
|
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence) ||
|
||
|
(hdr->hdr_restart.hdr_restartseq != sendreq->req_restartseq)) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTACK: received: does not match request, dropping "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d "
|
||
|
"src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
|
||
|
sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
|
||
|
sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
sendreq->req_restart++;
|
||
|
if (2 == sendreq->req_restart) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTACK: received: restarting send "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
mca_pml_bfo_send_request_restart(sendreq, false, 0);
|
||
|
} else {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTACK received: waiting for RNDVRESTARTNOTIFY completion "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Callback for when a RECVERRNOTIFY message is received. This message
|
||
|
* is sent from the receiver to the sender and tells the sender that
|
||
|
* the receiver has seen an error. This will trigger the sender
|
||
|
* to start the request restart sequence.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t* btl,
|
||
|
mca_btl_base_tag_t tag,
|
||
|
mca_btl_base_descriptor_t* des,
|
||
|
void* cbdata ) {
|
||
|
mca_btl_base_segment_t* segments = des->des_local;
|
||
|
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||
|
mca_pml_bfo_send_request_t* sendreq;
|
||
|
|
||
|
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY);
|
||
|
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;
|
||
|
|
||
|
/* First make sure that this message is pointing to a valid request.
|
||
|
* This can be determined if the communicator context, the source of
|
||
|
* the message, and the MPI sequence number all match. */
|
||
|
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
|
||
|
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
|
||
|
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RECVERRNOTIFY: received: does not match request, dropping "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d RQS:exp=%d,act=%d "
|
||
|
"src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
|
||
|
sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
|
||
|
sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/* If a good ACK was never received, then the first ACK received
|
||
|
* might be a RECVERRNOTIFY message. In that case, the sendreq does not
|
||
|
* have a valid req_recv pointer in it. Therefore, check for that
|
||
|
* case and update the field in the sendreq if necessary. */
|
||
|
if (NULL == sendreq->req_recv.pval) {
|
||
|
sendreq->req_recv = hdr->hdr_restart.hdr_dst_req;
|
||
|
}
|
||
|
|
||
|
/* Now check to see a restart needs to be issued. The request
|
||
|
* sequence number in the header is compared against the current
|
||
|
* request sequence number in the send request. If the header
|
||
|
* sequence number is greater than or equal to the send request
|
||
|
* number, then a rndvrestartnotify is issued. There are some cases
|
||
|
* where a few extra rndvrestartnotifys are issued. That is OK as
|
||
|
* it will all work itself out. The idea is to prevent many
|
||
|
* restarts unnecessarily. This still allows multiple restarts to
|
||
|
* happen. It could be that sometime later another error occurs
|
||
|
* which initiates a restart. That is OK as it will have the new
|
||
|
* sequence number and all is well. */
|
||
|
if (hdr->hdr_restart.hdr_restartseq >= sendreq->req_restartseq) {
|
||
|
assert(sendreq->req_send.req_base.req_ompi.req_state == OMPI_REQUEST_ACTIVE);
|
||
|
sendreq->req_error++;
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RECVERRNOTIFY: received: sendreq has error, outstanding events=%d, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq, (void *)sendreq,
|
||
|
sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
|
||
|
if (0 == sendreq->req_events) {
|
||
|
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
|
||
|
MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
|
||
|
OMPI_SUCCESS, btl);
|
||
|
}
|
||
|
} else {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RECVERRNOTIFY: received: error has already been noted, ignoring "
|
||
|
"PML:exp=%d,act=%d RQS:exp=%d,act=%d src_req=%p, dst_req=%p, peer=%d",
|
||
|
sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Callback for when a RNDVRESTARTNACK message is received. This message
|
||
|
* is sent from the receiver to the sender and tells the sender that
|
||
|
* the receiver has already completed the message and there is nothing
|
||
|
* else to be done. The sender should then just make the send request
|
||
|
* complete.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t* btl,
|
||
|
mca_btl_base_tag_t tag,
|
||
|
mca_btl_base_descriptor_t* des,
|
||
|
void* cbdata ) {
|
||
|
|
||
|
mca_btl_base_segment_t* segments = des->des_local;
|
||
|
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
|
||
|
mca_pml_bfo_send_request_t* sendreq;
|
||
|
|
||
|
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK);
|
||
|
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;
|
||
|
|
||
|
/* Not convinced a RNDVRESTARTNACK that does not match a request can
|
||
|
* happen, but have the check in here anyways for now */
|
||
|
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
|
||
|
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
|
||
|
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence) ||
|
||
|
(hdr->hdr_restart.hdr_restartseq != sendreq->req_restartseq)) {
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNACK: received: does not match request, dropping "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d "
|
||
|
"src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
|
||
|
sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
|
||
|
sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
opal_output_verbose(20, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNACK: received: marking send request as complete "
|
||
|
"PML=%d CTX=%d SRC=%d EXP=%d "
|
||
|
"src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
sendreq->req_send.req_base.req_peer, sendreq->req_restartseq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
/* Mark the sender complete. This data exchange is over. */
|
||
|
send_request_pml_complete(sendreq);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* This function gets called when failover is enabled and an error
|
||
|
* occurs during the rendezvous protocol. A message is sent to the
|
||
|
* receiving side notifying the request that the communication is
|
||
|
* going to be starting over. However, none of the information in the
|
||
|
* send request is reset yet, so that any in flight fragments can
|
||
|
* still find a home. Information in the send request gets reset when
|
||
|
* the completion event for this send occurs AND an ACK has been
|
||
|
* received back from the receiver.
|
||
|
*/
|
||
|
void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq,
|
||
|
bool repost, mca_btl_base_tag_t tag,
|
||
|
int status, mca_btl_base_module_t* btl)
|
||
|
{
|
||
|
mca_btl_base_descriptor_t* des;
|
||
|
mca_pml_bfo_restart_hdr_t* restart;
|
||
|
int rc;
|
||
|
mca_bml_base_btl_t* bml_btl;
|
||
|
ompi_proc_t* proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
|
||
|
/* If this message is not a repost, then update the sequence number. */
|
||
|
if (!repost) {
|
||
|
/* Bump up the rendezvous request sequence number. */
|
||
|
sendreq->req_restartseq++;
|
||
|
}
|
||
|
|
||
|
assert(0 == sendreq->req_events);
|
||
|
assert(0 != bml_endpoint->btl_eager.arr_size);
|
||
|
|
||
|
/* In the case that this is started because the receiver has
|
||
|
* sent us a message, then attempt to use a different BTL than the
|
||
|
* error message was received on. This may potentially tickle the
|
||
|
* error sooner if this side has not seen it yet. */
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
if (bml_btl->btl == btl) {
|
||
|
/* If there is more than one BTL left, then we will get a
|
||
|
* different one. If there is only one, we will just get
|
||
|
* the same one back again. That is OK. */
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
}
|
||
|
|
||
|
/* allocate descriptor */
|
||
|
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
|
||
|
sizeof(mca_pml_bfo_restart_hdr_t),
|
||
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||
|
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
|
||
|
if( OPAL_UNLIKELY(NULL == des) ) {
|
||
|
opal_output(0, "%s:%d Our of resources, cannot proceed", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
|
||
|
/* fill out header */
|
||
|
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
restart->hdr_match.hdr_common.hdr_flags = 0;
|
||
|
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY;
|
||
|
restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid;
|
||
|
restart->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank;
|
||
|
restart->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
|
||
|
restart->hdr_restartseq = sendreq->req_restartseq;
|
||
|
restart->hdr_src_req.pval = sendreq;
|
||
|
restart->hdr_dst_req = sendreq->req_recv;
|
||
|
restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */
|
||
|
restart->hdr_jobid = OMPI_PROC_MY_NAME->jobid;
|
||
|
restart->hdr_vpid = OMPI_PROC_MY_NAME->vpid;
|
||
|
|
||
|
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc);
|
||
|
|
||
|
/* initialize descriptor */
|
||
|
des->des_cbfunc = mca_pml_bfo_rndvrestartnotify_completion;
|
||
|
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNOTIFY: sent: PML=%d, RQS(new)=%d, CTX=%d, SRC=%d, "
|
||
|
"src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
|
||
|
restart->hdr_match.hdr_ctx, restart->hdr_match.hdr_src,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
|
||
|
rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY);
|
||
|
if( OPAL_UNLIKELY( rc < 0 ) ) {
|
||
|
opal_output(0, "[%s:%d] Cannot send rndvrestartnotify message", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This function restarts a RNDV send request. When this is called,
|
||
|
* all the fields in the send request are reset and the send is
|
||
|
* started over. The sendreq->req_restartseq will be non-zero which will
|
||
|
* trigger a special flag in the RNDV header which indicates the match
|
||
|
* has already happened on the receiving side.
|
||
|
*/
|
||
|
void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq,
|
||
|
bool repost, mca_btl_base_tag_t tag)
|
||
|
{
|
||
|
size_t offset = 0;
|
||
|
opal_list_item_t *first_item;
|
||
|
opal_list_item_t *last_item;
|
||
|
mca_bml_base_endpoint_t* endpoint;
|
||
|
size_t i;
|
||
|
|
||
|
/* If the tag is something valid, it was a repost. We could also
|
||
|
* check the repost field as well. Maybe I can drop the
|
||
|
* repost and have the tag double as it. */
|
||
|
switch (tag) {
|
||
|
case MCA_PML_BFO_HDR_TYPE_RNDV:
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDV: completion failed, reset and repost: PML=%d, RQS=%d, "
|
||
|
"CTX=%d, SRC=%d, src_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
break;
|
||
|
case MCA_PML_BFO_HDR_TYPE_RGET:
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RGET: completion failed, reset and repost: PML=%d, RQS=%d, "
|
||
|
"CTX=%d, SRC=%d, src_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
break;
|
||
|
default:
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
/* Return mpool resources, they get reacquired when request starts over. */
|
||
|
mca_pml_bfo_free_rdma_resources(sendreq);
|
||
|
|
||
|
/* Release any memory in use if this is a buffered send */
|
||
|
if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
|
||
|
sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
|
||
|
mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
|
||
|
}
|
||
|
|
||
|
/* Clear out any unsent send ranges. Recreate the functionality
|
||
|
* from the get_send_range() and get_next_send_range() functions. */
|
||
|
OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
|
||
|
first_item = opal_list_get_begin(&sendreq->req_send_ranges);
|
||
|
last_item = opal_list_get_last(&sendreq->req_send_ranges);
|
||
|
while (first_item != last_item) {
|
||
|
opal_list_remove_item(&sendreq->req_send_ranges, last_item);
|
||
|
OMPI_FREE_LIST_RETURN_MT(&mca_pml_bfo.send_ranges, (ompi_free_list_item_t *)last_item);
|
||
|
last_item = opal_list_get_last(&sendreq->req_send_ranges);
|
||
|
}
|
||
|
OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
|
||
|
|
||
|
/* Reset the converter to the beginning. */
|
||
|
opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
|
||
|
&offset);
|
||
|
|
||
|
/* Bump up internal sequence number to handle possible duplicate
|
||
|
* RNDV messages. In the case of reposting a RNDV message, do not
|
||
|
* increment the value. That way, a duplicate message can be
|
||
|
* detected. */
|
||
|
if (!repost) {
|
||
|
sendreq->req_restartseq++;
|
||
|
}
|
||
|
|
||
|
/* This code here is essentially the same is mca_pml_bfo_send_request_start()
|
||
|
* but with a few modifications since we are restarting the request, not
|
||
|
* starting entirely from scratch. */
|
||
|
endpoint = (mca_bml_base_endpoint_t*)sendreq->req_send.req_base.req_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
sendreq->req_endpoint = endpoint;
|
||
|
sendreq->req_state = 0;
|
||
|
sendreq->req_lock = 0;
|
||
|
sendreq->req_pipeline_depth = 0;
|
||
|
sendreq->req_bytes_delivered = 0;
|
||
|
sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
|
||
|
|
||
|
/* Note that we do not reset the following three items.
|
||
|
* They stay with their original values.
|
||
|
* sendreq->req_send.req_base.req_sequence
|
||
|
* sendreq->req_restartseq
|
||
|
* sendreq->req_recv.pval
|
||
|
*/
|
||
|
sendreq->req_restart = 0; /* reset in case we restart again */
|
||
|
sendreq->req_error = 0; /* clear error state */
|
||
|
sendreq->req_events = 0; /* clear events, probably 0 anyways */
|
||
|
|
||
|
MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );
|
||
|
|
||
|
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
|
||
|
mca_bml_base_btl_t* bml_btl;
|
||
|
int rc;
|
||
|
|
||
|
/* select a btl */
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||
|
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
|
||
|
if(OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc))
|
||
|
return;
|
||
|
}
|
||
|
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This function will repost a match fragment. This function has to
|
||
|
* handle the case where there may not be a request associated with
|
||
|
* the fragment and just use the information in the fragment to
|
||
|
* repost the send.
|
||
|
*/
|
||
|
void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
|
||
|
{
|
||
|
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
|
||
|
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
|
||
|
struct mca_bml_base_endpoint_t* endpoint;
|
||
|
int rc;
|
||
|
size_t offset = 0;
|
||
|
|
||
|
/* At this point a determination has to be made whether the
|
||
|
* BFO_HDR_TYPE_MATCH fragment was sent via the sendi interface or
|
||
|
* via the regular send interface. This is important because if it
|
||
|
* was sent via the sendi interface, then the request associated
|
||
|
* with it has already been completed and released. This can be
|
||
|
* determined by looking at the des->des_flags field of the
|
||
|
* descriptor. If the ALWAYS_CALLBACK flag is set then it is known
|
||
|
* that there is a valid send request associated with the fragment
|
||
|
* and it can be used to extricate information. If ALWAYS_CALLBACK
|
||
|
* is not set, then the endpoint information is in the callback
|
||
|
* data field and where to resend the fragment can be determined
|
||
|
* from the fragment. */
|
||
|
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||
|
endpoint = sendreq->req_endpoint;
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"MATCH: repost: src_req=%p",
|
||
|
(void *)sendreq);
|
||
|
} else {
|
||
|
endpoint = des->des_cbdata;
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"MATCH: repost: des=%p (sendi fragment)",
|
||
|
(void *)des);
|
||
|
}
|
||
|
|
||
|
assert(0 != endpoint->btl_eager.arr_size);
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||
|
|
||
|
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||
|
/* Reset the converter to the beginning if the message is
|
||
|
* not a zero-length message. In the case of zero-length
|
||
|
* message, the convertor is not being used. */
|
||
|
if (0 != sendreq->req_send.req_bytes_packed) {
|
||
|
opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
|
||
|
&offset);
|
||
|
}
|
||
|
rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
|
||
|
if (OMPI_SUCCESS == rc) {
|
||
|
return;
|
||
|
} else if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"Warning: delaying reposting of BFO_HDR_TYPE_MATCH, btls=%d",
|
||
|
(int)sendreq->req_endpoint->btl_eager.arr_size);
|
||
|
add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
|
||
|
return;
|
||
|
} else {
|
||
|
opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
} else {
|
||
|
/* No send request available so alloc and repost explicitly */
|
||
|
mca_btl_base_descriptor_t* newdes = NULL;
|
||
|
mca_btl_base_segment_t* oldseg;
|
||
|
mca_btl_base_segment_t* newseg;
|
||
|
|
||
|
oldseg = des->des_local;
|
||
|
/* The alloc routine must be called with the MCA_BTL_NO_ORDER
|
||
|
* flag so that the allocation routine works. The allocation
|
||
|
* will fill in the order flag in the descriptor. */
|
||
|
mca_bml_base_alloc( bml_btl, &newdes,
|
||
|
MCA_BTL_NO_ORDER,
|
||
|
oldseg->seg_len,
|
||
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||
|
if (OPAL_UNLIKELY(NULL == newdes)) {
|
||
|
opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
newseg = newdes->des_local;
|
||
|
/* Copy over all the data that is actually sent over the wire */
|
||
|
memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len);
|
||
|
newseg->seg_len = oldseg->seg_len;
|
||
|
|
||
|
/* This call will either return OMPI_SUCCESS or OMPI_ERROR. The
|
||
|
* OMPI_SUCCESS only says that the send request can be freed.
|
||
|
* It may be that the message was queued up in the BTL. */
|
||
|
rc = mca_bml_base_send(bml_btl, newdes, MCA_PML_BFO_HDR_TYPE_MATCH);
|
||
|
|
||
|
/* Some BTLs will set the CALLBACK flag but we do not want that
|
||
|
* as there is no longer a request associated with this descriptor.
|
||
|
* Therefore, always make sure it is cleared. */
|
||
|
newdes->des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||
|
|
||
|
if( OPAL_LIKELY( rc >= 0 )) {
|
||
|
/* Just let the normal flow of data free whatever needs
|
||
|
* to be freed */
|
||
|
return;
|
||
|
} else {
|
||
|
opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
}
|
||
|
/* No need to free any descriptors. The BTLs take care of it since
|
||
|
* we originally allocated with MCA_BTL_DES_FLAGS_BTL_OWNERSHIP. */
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Completion callback for rndvrestartnotify completion event. If the
|
||
|
* RNDVRESTARTACK has already been received, then reset and restart.
|
||
|
* Otherwise, just update the state and let the RNDVRESTARTACK trigger
|
||
|
* the reset and restart.
|
||
|
*/
|
||
|
void
|
||
|
mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl,
|
||
|
struct mca_btl_base_endpoint_t* ep,
|
||
|
struct mca_btl_base_descriptor_t* des,
|
||
|
int status)
|
||
|
{
|
||
|
mca_pml_bfo_restart_hdr_t* restart;
|
||
|
mca_pml_bfo_send_request_t* sendreq;
|
||
|
|
||
|
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval;
|
||
|
|
||
|
/* Need to resend this message in the case that it fails */
|
||
|
if( OPAL_UNLIKELY((OMPI_SUCCESS != status))) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNOTIFY: completion failed: repost "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq,
|
||
|
(void *)sendreq, sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
/* Repost the message and indicate it is a repost, not a new one. No need
|
||
|
* to check the req_events as this is the only possible outstanding send
|
||
|
* event when we have posted this message. We also know the sendreq is still
|
||
|
* available because nothing can proceed until this completion event happens
|
||
|
* successfully as we track the req_restart value. */
|
||
|
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, true,
|
||
|
MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
|
||
|
status, btl);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/* The req_restart value is incremented to indicate completion of
|
||
|
* the RNDVRESTARTNOTIFY message. Then (typically) the arrival of the
|
||
|
* ACK message will cause the request to reset and restart. Need to
|
||
|
* make sure that RNDVRESTARTNOTIFY callback has been called as well as
|
||
|
* the ACK back from the receiver prior to resetting and restarting
|
||
|
* the request. This is needed in case we get an error on the
|
||
|
* RNDVRESTARTNOTIFY message, but it actually makes it over. We want
|
||
|
* to make sure the send request has not restarted yet. So, keep a
|
||
|
* counter that counts to 2. */
|
||
|
sendreq->req_restart++;
|
||
|
if (2 == sendreq->req_restart) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNOTIFY: completion: restarting request "
|
||
|
"PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
sendreq->req_recv.pval, (void *)sendreq,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
mca_pml_bfo_send_request_restart(sendreq, false, 0);
|
||
|
} else {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNOTIFY: completion: waiting for ack "
|
||
|
"PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
sendreq->req_recv.pval, (void *)sendreq,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This function is called when an error is detected on a completion
|
||
|
* event on the receiving side. This can come from a ACK, PUT, RDMA
|
||
|
* read (GET) or RECVERRNOTIFY completion event. When this happens, check
|
||
|
* the state of the request and decide if the sender needs be notified
|
||
|
* that a problem was seen. If no RECVERRNOTIFY message has been sent and
|
||
|
* no RNDVRESTARTNOTIFY has been received from the sender, then send a
|
||
|
* message telling the sender an error was seen.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq,
|
||
|
mca_btl_base_tag_t tag, int status)
|
||
|
{
|
||
|
mca_btl_base_descriptor_t* des;
|
||
|
mca_pml_bfo_restart_hdr_t* restart;
|
||
|
ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
mca_bml_base_btl_t* bml_btl;
|
||
|
int rc;
|
||
|
|
||
|
assert(0 != bml_endpoint->btl_eager.arr_size);
|
||
|
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
|
||
|
/* allocate descriptor */
|
||
|
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
|
||
|
sizeof(mca_pml_bfo_restart_hdr_t),
|
||
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||
|
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
|
||
|
if( OPAL_UNLIKELY(NULL == des) ) {
|
||
|
opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
|
||
|
/* fill out header */
|
||
|
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
restart->hdr_match.hdr_common.hdr_flags = 0;
|
||
|
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY;
|
||
|
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
|
||
|
restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank;
|
||
|
restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq;
|
||
|
restart->hdr_restartseq = recvreq->req_restartseq;
|
||
|
restart->hdr_src_req = recvreq->remote_req_send;
|
||
|
restart->hdr_dst_req.pval = recvreq;
|
||
|
|
||
|
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY, proc);
|
||
|
|
||
|
/* initialize descriptor */
|
||
|
des->des_cbfunc = mca_pml_bfo_recv_restart_completion;
|
||
|
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RECVERRNOTIFY: sending to sender, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d, btl=%p",
|
||
|
recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval,
|
||
|
(void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||
|
(void *)bml_btl->btl);
|
||
|
|
||
|
rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY);
|
||
|
if( OPAL_UNLIKELY( rc < 0 ) ) {
|
||
|
opal_output(0, "[%s:%d] Cannot send recverrnotify message", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
/* Prevent future error messages on this request */
|
||
|
recvreq->req_errstate |= RECVREQ_RECVERRSENT;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This function is called when it may be time to send a RNDVRESTARTACK
|
||
|
* message back to the sending side. This can happen because we
|
||
|
* received a RNDVRESTARTNOTIFY message from the sender. This can
|
||
|
* also happen if we have noticed that the request has received the
|
||
|
* RNDVRESTARTNOTIFY message, but has not yet sent out the RNDVRESTARTACK
|
||
|
* because there were still some pending receive events on the request.
|
||
|
* That means we can enter this routine from a completion event on a ACK,
|
||
|
* PUT, or RDMA read as well as from the receipt of a RNDVRESTARTNOTIFY
|
||
|
* message. If all is good, we sent the RNDVRESTARTACK message back to
|
||
|
* the sender. Then sometime later a message will arrive telling us
|
||
|
* to reset and restart the receive request.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq,
|
||
|
mca_btl_base_tag_t tag, int status,
|
||
|
mca_btl_base_module_t* btl)
|
||
|
{
|
||
|
mca_btl_base_descriptor_t* des;
|
||
|
mca_pml_bfo_restart_hdr_t* restart;
|
||
|
ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
mca_bml_base_btl_t* bml_btl;
|
||
|
int rc;
|
||
|
|
||
|
assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) == RECVREQ_RNDVRESTART_RECVED);
|
||
|
assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_ACKED) == 0);
|
||
|
assert(0 != bml_endpoint->btl_eager.arr_size);
|
||
|
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
|
||
|
/* Attempt to use a different BTL than the error message was
|
||
|
* received on. This may potentially tickle the error sooner if
|
||
|
* this side has not seen it yet. */
|
||
|
if (bml_btl->btl == btl) {
|
||
|
/* If there is more than one BTL left, then we will get a
|
||
|
* different one. If there is only one, we will just get
|
||
|
* the same one back again. That is OK. */
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
}
|
||
|
|
||
|
/* allocate descriptor */
|
||
|
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
|
||
|
sizeof(mca_pml_bfo_restart_hdr_t),
|
||
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||
|
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
|
||
|
if( OPAL_UNLIKELY(NULL == des) ) {
|
||
|
opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
|
||
|
/* fill out header */
|
||
|
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
restart->hdr_match.hdr_common.hdr_flags = 0;
|
||
|
restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK;
|
||
|
restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid;
|
||
|
restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank;
|
||
|
restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq;
|
||
|
restart->hdr_restartseq = recvreq->req_restartseq;
|
||
|
restart->hdr_src_req = recvreq->remote_req_send;
|
||
|
restart->hdr_dst_req.pval = recvreq;
|
||
|
|
||
|
bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, proc);
|
||
|
|
||
|
/* initialize descriptor */
|
||
|
des->des_cbfunc = mca_pml_bfo_recv_restart_completion;
|
||
|
des->des_cbdata = (void *)proc;
|
||
|
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTACK: due to PML tag=%d completion, sending to "
|
||
|
"sender, PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, "
|
||
|
"peer=%d, btl=%p",
|
||
|
tag, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq, status,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
|
||
|
(void *)bml_btl->btl);
|
||
|
|
||
|
rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK);
|
||
|
if( OPAL_UNLIKELY( rc < 0 ) ) {
|
||
|
opal_output(0, "[%s:%d] Cannot send rndvrestartack message", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
/* Move to the next state so we do not send anymore ACKs */
|
||
|
recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_RECVED;
|
||
|
recvreq->req_errstate |= RECVREQ_RNDVRESTART_ACKED;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Called after the receipt of a RNDVRESTARTNOTIFY message to a request
|
||
|
* that no longer matches. This can happen if the sender detected an
|
||
|
* error, but the receiver actually received all the data. Therefore
|
||
|
* send a NACK back instead of the ACK so that the sender can complete
|
||
|
* its request. This happens very rarely. Note that we need to make
|
||
|
* use of the hdr_dst_rank that we received from the notify message.
|
||
|
* This is so the sending side make sure the message matches a valid
|
||
|
* request on the sending side.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes,
|
||
|
ompi_proc_t* ompi_proc, bool repost)
|
||
|
{
|
||
|
mca_btl_base_segment_t* segments;
|
||
|
mca_pml_bfo_restart_hdr_t* hdr; /* hdr of NOTIFY message */
|
||
|
mca_pml_bfo_restart_hdr_t* nack; /* hdr of NACK message */
|
||
|
mca_btl_base_descriptor_t* des;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint;
|
||
|
mca_bml_base_btl_t* bml_btl;
|
||
|
int rc;
|
||
|
|
||
|
if (repost) {
|
||
|
/* In the case where we are reposting the NACK, the information
|
||
|
* is in the src area, since we are reposting a send. In addition,
|
||
|
* we get the ompi_proc from the old descriptor. */
|
||
|
ompi_proc = olddes->des_cbdata;
|
||
|
}
|
||
|
|
||
|
segments = olddes->des_local;
|
||
|
hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval;
|
||
|
|
||
|
bml_endpoint = ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
assert(0 != bml_endpoint->btl_eager.arr_size);
|
||
|
bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
|
||
|
/* allocate descriptor */
|
||
|
mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
|
||
|
sizeof(mca_pml_bfo_restart_hdr_t),
|
||
|
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
|
||
|
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
|
||
|
if( OPAL_UNLIKELY(NULL == des) ) {
|
||
|
opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
|
||
|
/* fill out header */
|
||
|
nack = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
nack->hdr_match.hdr_common.hdr_flags = 0;
|
||
|
nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK;
|
||
|
nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx;
|
||
|
nack->hdr_match.hdr_src = hdr->hdr_dst_rank; /* Receiver rank */
|
||
|
nack->hdr_match.hdr_seq = hdr->hdr_match.hdr_seq;
|
||
|
nack->hdr_restartseq = hdr->hdr_restartseq;
|
||
|
nack->hdr_src_req = hdr->hdr_src_req;
|
||
|
nack->hdr_dst_req.pval = 0;
|
||
|
|
||
|
bfo_hdr_hton(nack, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK, ompi_proc);
|
||
|
|
||
|
/* Initialize descriptor. Save away ompi_proc in case we need
|
||
|
* to respost this fragmnet. */
|
||
|
des->des_cbfunc = mca_pml_bfo_recv_restart_completion;
|
||
|
des->des_cbdata = ompi_proc;
|
||
|
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNACK: sending to sender, "
|
||
|
"PML=%d, RQS=%d, CTX=%d, SRC=%d, peer=%d",
|
||
|
nack->hdr_match.hdr_seq, nack->hdr_restartseq,
|
||
|
nack->hdr_match.hdr_ctx, nack->hdr_match.hdr_src,
|
||
|
OMPI_CAST_RTE_NAME(&ompi_proc->super.proc_name)->vpid);
|
||
|
|
||
|
rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK);
|
||
|
if( OPAL_UNLIKELY( rc < 0 ) ) {
|
||
|
opal_output(0, "[%s:%d] Cannot send rndvrestartnack message", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/**
|
||
|
* Reset all the receive request fields to match what a request
|
||
|
* looks like when it is first started. This gets called when
|
||
|
* the rendezvous/rget message is being restarted.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* match) {
|
||
|
int i;
|
||
|
|
||
|
assert(true != match->req_recv.req_base.req_pml_complete);
|
||
|
|
||
|
/* Free up any resources that were reserved for this receive. This
|
||
|
* was copied from the receive completion code. */
|
||
|
for(i = 0; i < (int)match->req_rdma_cnt; i++) {
|
||
|
mca_mpool_base_registration_t* btl_reg = match->req_rdma[i].btl_reg;
|
||
|
if( NULL != btl_reg && btl_reg->mpool != NULL) {
|
||
|
btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg );
|
||
|
}
|
||
|
}
|
||
|
match->req_rdma_cnt = 0;
|
||
|
|
||
|
/* This code is mostly copied from mca_pml_bfo_recv_req_start.
|
||
|
* Note 1: Leave req_bytes_expected as the original value. No
|
||
|
* need to adjust this as it is set when convertor is created.
|
||
|
* Note 2: Leave req_bytes_delivered as the original value.
|
||
|
* This is created when the convertor is created and represents
|
||
|
* the expected bytes from the user. */
|
||
|
assert(0 == match->req_events);
|
||
|
match->req_errstate = 0;
|
||
|
match->req_lock = 0;
|
||
|
match->req_pipeline_depth = 0;
|
||
|
match->req_bytes_received = 0;
|
||
|
match->req_rdma_idx = 0;
|
||
|
match->req_rdma_offset = 0;
|
||
|
match->req_send_offset = 0;
|
||
|
match->req_pending = false;
|
||
|
match->req_ack_sent = false;
|
||
|
match->req_restartseq++;
|
||
|
|
||
|
/* These really should not need to be set, but this matches some
|
||
|
* of the initialization within MCA_PML_BASE_RECV_START. */
|
||
|
match->req_recv.req_base.req_pml_complete = false;
|
||
|
match->req_recv.req_base.req_ompi.req_complete = false;
|
||
|
match->req_recv.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE;
|
||
|
|
||
|
/* Reset the convertor */
|
||
|
opal_convertor_set_position(&match->req_recv.req_base.req_convertor,
|
||
|
&match->req_rdma_offset);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Completion callback for RNDVRESTARTACK, RNDVRESTARTNACK and RECVERRNOTIFY.
|
||
|
*/
|
||
|
void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl,
|
||
|
struct mca_btl_base_endpoint_t* ep,
|
||
|
struct mca_btl_base_descriptor_t* des,
|
||
|
int status )
|
||
|
{
|
||
|
if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
|
||
|
mca_pml_bfo_common_hdr_t* common = des->des_local->seg_addr.pval;
|
||
|
mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */
|
||
|
mca_pml_bfo_recv_request_t* recvreq;
|
||
|
|
||
|
switch (common->hdr_type) {
|
||
|
case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK:
|
||
|
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTACK: completion failed: try again "
|
||
|
"PML:req=%d,hdr=%d RQS:req=%d,hdr=%d CTX:req=%d,hdr=%d "
|
||
|
"src_req=%p, dst_req=%p, peer=%d",
|
||
|
recvreq->req_msgseq, restart->hdr_match.hdr_seq,
|
||
|
recvreq->req_restartseq, restart->hdr_restartseq,
|
||
|
recvreq->req_recv.req_base.req_comm->c_contextid,
|
||
|
restart->hdr_match.hdr_ctx,
|
||
|
recvreq->remote_req_send.pval,
|
||
|
(void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
|
||
|
/* Adjust the states back to avoid assert errors */
|
||
|
recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_ACKED;
|
||
|
recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED;
|
||
|
mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK,
|
||
|
status, btl);
|
||
|
break;
|
||
|
case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK:
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RNDVRESTARTNACK: completion failed: try again "
|
||
|
"des=%p ", (void *)des);
|
||
|
/* Just blast it again. No request associated with it. */
|
||
|
mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true);
|
||
|
break;
|
||
|
case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY:
|
||
|
restart = (mca_pml_bfo_restart_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval;
|
||
|
/* With just two BTLs, this should never happen as we are
|
||
|
* typically sending the RECVERRNOTIFY message on the
|
||
|
* working BTL. But, just in case, if we get an error,
|
||
|
* send it again. */
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RECVERRNOTIFY: completion failed: try again, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval,
|
||
|
(void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
|
||
|
status);
|
||
|
break;
|
||
|
default:
|
||
|
opal_output(0, "[%s:%d] Unknown callback error", __FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Remove a btl for future communication on an endpoint.
|
||
|
*/
|
||
|
void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl,
|
||
|
ompi_proc_t *errproc, char *btlname)
|
||
|
{
|
||
|
mca_bml_base_endpoint_t* ep;
|
||
|
bool remove = false;
|
||
|
int i;
|
||
|
|
||
|
ep = (mca_bml_base_endpoint_t*)errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
|
||
|
/* The bml_del_proc_btl function does not indicate if it
|
||
|
* actually removed a btl, so let me check up front. This is
|
||
|
* done so that we can only print out messages when a btl is
|
||
|
* actually going to be removed. These arrays are small so it
|
||
|
* is OK to walk through all of them even though it may be
|
||
|
* redundant. */
|
||
|
for( i = 0; i < (int)ep->btl_eager.arr_size; i++ ) {
|
||
|
if( ep->btl_eager.bml_btls[i].btl == btl ) {
|
||
|
remove = true;
|
||
|
}
|
||
|
}
|
||
|
for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
|
||
|
if( ep->btl_send.bml_btls[i].btl == btl ) {
|
||
|
remove = true;
|
||
|
}
|
||
|
}
|
||
|
for( i = 0; i < (int)ep->btl_rdma.arr_size; i++ ) {
|
||
|
if( ep->btl_rdma.bml_btls[i].btl == btl ) {
|
||
|
remove = true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (true == remove) {
|
||
|
mca_bml.bml_del_proc_btl(errproc, btl);
|
||
|
|
||
|
opal_output_verbose(10, mca_pml_bfo_output,
|
||
|
"BTL %s error: rank=%d mapping out %s "
|
||
|
"to rank=%d on node=%s \n",
|
||
|
btl->btl_component->btl_version.mca_component_name,
|
||
|
OMPI_PROC_MY_NAME->vpid,
|
||
|
btlname, OMPI_CAST_RTE_NAME(&errproc->super.proc_name)->vpid,
|
||
|
(NULL == errproc->super.proc_hostname) ? "unknown" : errproc->super.proc_hostname);
|
||
|
|
||
|
/* Need to search for any pending packets associated
|
||
|
* with this endpoint and remove them. We may also
|
||
|
* have to restarts depending on the state of the
|
||
|
* requests. */
|
||
|
mca_pml_bfo_error_pending_packets(btl, ep);
|
||
|
|
||
|
if ((ep->btl_eager.arr_size == 0) &&
|
||
|
(ep->btl_send.arr_size == 0) &&
|
||
|
(ep->btl_rdma.arr_size == 0)) {
|
||
|
opal_output(0, "%s:%d: No more interfaces, aborting",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void mca_pml_bfo_failover_error_handler(struct mca_btl_base_module_t* btl,
|
||
|
int32_t flags, ompi_proc_t *errproc, char *btlname)
|
||
|
{
|
||
|
ompi_proc_t** procs;
|
||
|
size_t p, num_procs;
|
||
|
|
||
|
/* If we are in here, we know that the we were called
|
||
|
* with the flags == MCA_BTL_ERROR_FLAGS_NONFATAL so no
|
||
|
* need to check it in here. */
|
||
|
assert(flags & MCA_BTL_ERROR_FLAGS_NONFATAL);
|
||
|
|
||
|
procs = ompi_proc_all(&num_procs);
|
||
|
|
||
|
if(NULL == procs) {
|
||
|
opal_output(0, "%s:%d: Out of memory, giving up.",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
|
||
|
if (NULL == btlname) {
|
||
|
btlname = "unknown";
|
||
|
}
|
||
|
|
||
|
/* If the process to map out is not specified then map out the
|
||
|
* entire BTL. Otherwise, only map out the BTL for the specific
|
||
|
* remote process. */
|
||
|
if (NULL == errproc) {
|
||
|
for( p = 0; p < num_procs; p++ ) {
|
||
|
mca_pml_bfo_map_out_btl(btl, procs[p], btlname);
|
||
|
}
|
||
|
} else {
|
||
|
mca_pml_bfo_map_out_btl(btl, errproc, btlname);
|
||
|
}
|
||
|
free(procs);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* This function is called since when we are mapping out a BML. This
|
||
|
* will walk through the four PML lists and dispatch with the
|
||
|
* fragments/requests. There are four different lists and each one is
|
||
|
* handled slighty differently. In all cases, we first see if the
|
||
|
* message is associated with the endpoint that is being mapped out.
|
||
|
* If not, then just leave it alone and put it back on the list. If
|
||
|
* it is associated with the endpoint, then a each list handles it
|
||
|
* slighlty differently. Also, in some cases, we actually adjust the
|
||
|
* pointers to the BMLs in the messages as they may have changed when
|
||
|
* the BML is mapped out. That is because this is called after we
|
||
|
* have mapped out the offending BML and adjusted the array of
|
||
|
* available BMLs.
|
||
|
*/
|
||
|
static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl,
|
||
|
mca_bml_base_endpoint_t* ep) {
|
||
|
int32_t i, s;
|
||
|
|
||
|
/* The pckt_pending list contains both ACK and FIN messages.
|
||
|
* ACKs can be sent over any BTL associated with the endpoint.
|
||
|
* Therefore, the bml_btl entry for ACKS is NULL and they do
|
||
|
* not need to be adjusted. It is also worth noting that
|
||
|
* the ACK will be the only outstanding message associated
|
||
|
* with a request so we can just let nature takes it course.
|
||
|
*
|
||
|
* FIN messages do have a BML associated with them, but they
|
||
|
* can also be sent over any BTL. Therefore, adjust the bml
|
||
|
* pointer in the pckt to ensure it points at a valid BML.
|
||
|
*/
|
||
|
|
||
|
s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending);
|
||
|
for(i = 0; i < s; i++) {
|
||
|
mca_pml_bfo_pckt_pending_t *pckt;
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"INFO: pckt_pending list has %d entries", s);
|
||
|
#if 1
|
||
|
/* TODO: Error out until code is tested */
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"%s:%d: Support not implemented, aborting",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
#endif
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
pckt = (mca_pml_bfo_pckt_pending_t*)
|
||
|
opal_list_remove_first(&mca_pml_bfo.pckt_pending);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
|
||
|
/* My guess is that this can happen in the threaded
|
||
|
* case where the other thread removed some packets
|
||
|
* after we determined the size of the list. */
|
||
|
if(NULL == pckt)
|
||
|
break;
|
||
|
|
||
|
/* If there is no bml stored on the packet, then just
|
||
|
* put it back on the list as there is nothing to adjust.
|
||
|
* This appears to be true with ACK packets. */
|
||
|
if (NULL == pckt->bml_btl) {
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
opal_list_append(&mca_pml_bfo.pckt_pending,
|
||
|
(opal_list_item_t*)pckt);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
/* Now see if this endpoint matches the one we are mapping
|
||
|
* out. If so, adjust the bml entry so to ensure it is
|
||
|
* not pointing at a stale bml. We do not really care
|
||
|
* which BML it is pointing at as long as it is valid.
|
||
|
* In either case, then put entry back on the list. */
|
||
|
if (pckt->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] == ep) {
|
||
|
opal_output_verbose(15, mca_pml_bfo_output,
|
||
|
"INFO: Found matching pckt on pckt_pending list, adjusting bml");
|
||
|
pckt->bml_btl = mca_bml_base_btl_array_get_next(&ep->btl_eager);
|
||
|
}
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
opal_list_append(&mca_pml_bfo.pckt_pending,
|
||
|
(opal_list_item_t*)pckt);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
|
||
|
}
|
||
|
|
||
|
/* This next list holds rdma fragments. We need to walk through
|
||
|
* the list and see if any are associated with the endpoint
|
||
|
* we are mapping out. If not, then just put back on the
|
||
|
* list. If they are, then we need to error them out. One issue
|
||
|
* is that we need to deal with the case where there may be more
|
||
|
* then one pending rdma fragment for a request. */
|
||
|
s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending);
|
||
|
for(i = 0; i < s; i++) {
|
||
|
mca_pml_bfo_rdma_frag_t* frag;
|
||
|
mca_pml_bfo_send_request_t* sendreq;
|
||
|
mca_pml_bfo_recv_request_t* recvreq;
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"INFO: rdma_pending list has %d entries", s);
|
||
|
#if 1
|
||
|
/* TODO: Error out until code is tested */
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"%s:%d: Support not implemented, aborting",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
#endif
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
frag = (mca_pml_bfo_rdma_frag_t*)
|
||
|
opal_list_remove_first(&mca_pml_bfo.rdma_pending);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
|
||
|
/* My guess is that this can happen in the threaded
|
||
|
* case where the other thread removed some packets
|
||
|
* after we determined the size of the list. */
|
||
|
if(NULL == frag)
|
||
|
break;
|
||
|
|
||
|
/* Check to see if it matches our endpoint. If it does,
|
||
|
* then check if it matches the BTL that is being mapped
|
||
|
* out. If it does not, then just readjust the BML pointer.
|
||
|
* If it does, then we need to do something with it. */
|
||
|
if (frag->rdma_ep != ep) {
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
opal_list_append(&mca_pml_bfo.rdma_pending,
|
||
|
(opal_list_item_t*)frag);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
/* If we are here, then we know we are working on the same
|
||
|
* endpoint. Now check the BTL. */
|
||
|
if (frag->rdma_btl != btl) {
|
||
|
opal_output_verbose(15, mca_pml_bfo_output,
|
||
|
"INFO: Found matching frag on rdma_pending list, adjusting bml");
|
||
|
/* The BTL this RDMA is associated with is not the
|
||
|
* one that is getting mapped out, so just adjust the
|
||
|
* BML pointer and put back on the list. */
|
||
|
frag->rdma_bml = mca_bml_base_btl_array_find(&ep->btl_rdma, frag->rdma_btl);
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
opal_list_append(&mca_pml_bfo.rdma_pending,
|
||
|
(opal_list_item_t*)frag);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
/* Now we call the restart routine. This is just like if we got
|
||
|
* a completion event after calling an RDMA write. This will
|
||
|
* take care of figuring out if we need to restart the request
|
||
|
* or wait for any outstanding events to complete. */
|
||
|
if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) {
|
||
|
opal_output_verbose(15, mca_pml_bfo_output,
|
||
|
"INFO: Found matching PUT frag on rdma_pending list, restarting");
|
||
|
sendreq = frag->rdma_req;
|
||
|
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
|
||
|
MCA_PML_BFO_HDR_TYPE_PUT, 2, btl);
|
||
|
MCA_PML_BFO_RDMA_FRAG_RETURN(frag);
|
||
|
} else {
|
||
|
opal_output_verbose(15, mca_pml_bfo_output,
|
||
|
"INFO: Found matching RGET frag on rdma_pending list, sending reqerror");
|
||
|
/* This is just like what we do on an rget completion event */
|
||
|
recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req;
|
||
|
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, 2);
|
||
|
|
||
|
/* See if the request has received a RNDVRESTARTNOTIFY */
|
||
|
if( OPAL_UNLIKELY(recvreq->req_errstate)) {
|
||
|
if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) {
|
||
|
mca_pml_bfo_recv_request_rndvrestartack(recvreq,
|
||
|
MCA_PML_BFO_HDR_TYPE_RGET,
|
||
|
2, btl);
|
||
|
}
|
||
|
}
|
||
|
MCA_PML_BFO_RDMA_FRAG_RETURN(frag);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
s = opal_list_get_size(&mca_pml_bfo.send_pending);
|
||
|
/* Look for pending events on our endpoint */
|
||
|
for(i = 0; i < s; i++) {
|
||
|
mca_pml_bfo_send_request_t* sendreq;
|
||
|
ompi_proc_t* proc;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint;
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"INFO: send_pending list has %d entries", s);
|
||
|
#if 1
|
||
|
/* TODO: Error out until code is tested */
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"%s:%d: Support not implemented, aborting",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
#endif
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
sendreq = (mca_pml_bfo_send_request_t*)
|
||
|
opal_list_remove_first(&mca_pml_bfo.send_pending);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
|
||
|
/* My guess is that this can happen in the threaded
|
||
|
* case where the other thread removed some packets
|
||
|
* after we determined the size of the list. */
|
||
|
if(NULL == sendreq)
|
||
|
break;
|
||
|
|
||
|
proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc;
|
||
|
bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
|
||
|
/* Check to see if it matches our endpoint. If it does not,
|
||
|
* then just put it back on the list as there is nothing
|
||
|
* we need to do with it. */
|
||
|
if (bml_endpoint != ep) {
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
opal_list_append(&mca_pml_bfo.send_pending,
|
||
|
(opal_list_item_t*)sendreq);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
switch(sendreq->req_pending) {
|
||
|
case MCA_PML_BFO_SEND_PENDING_SCHEDULE:
|
||
|
/* If this send request is using the endpoint that received
|
||
|
* the error, then let us error it out. In the case
|
||
|
* where there is only one fragment left to be scheduled
|
||
|
* and it would have gone over the good BTL, this is
|
||
|
* not necessary. But, we will use simplicity here
|
||
|
* and assume that some of the fragments are still
|
||
|
* scheduled to go over the broken BTL. */
|
||
|
sendreq->req_error++;
|
||
|
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
|
||
|
MCA_PML_BFO_HDR_TYPE_FRAG, 2, btl);
|
||
|
break;
|
||
|
case MCA_PML_BFO_SEND_PENDING_START:
|
||
|
/* If the request has not even started, then just put it back
|
||
|
* on the list. Nothing else to do with it. */
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
opal_list_append(&mca_pml_bfo.send_pending,
|
||
|
(opal_list_item_t*)sendreq);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
break;
|
||
|
default:
|
||
|
opal_output(0, "[%s:%d] wrong send request type\n",
|
||
|
__FILE__, __LINE__);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
s = (int)opal_list_get_size(&mca_pml_bfo.recv_pending);
|
||
|
for(i = 0; i < s; i++) {
|
||
|
mca_pml_bfo_recv_request_t* recvreq;
|
||
|
ompi_proc_t* proc;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint;
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"INFO: recv_pending list has %d entries", s);
|
||
|
#if 1
|
||
|
/* TODO: Error out until code is tested */
|
||
|
opal_output_verbose(0, mca_pml_bfo_output,
|
||
|
"%s:%d: Support not implemented, aborting",
|
||
|
__FILE__, __LINE__);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
#endif
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
recvreq = (mca_pml_bfo_recv_request_t*)
|
||
|
opal_list_remove_first(&mca_pml_bfo.recv_pending);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
|
||
|
/* My guess is that this can happen in the threaded
|
||
|
* case where the other thread removed some packets
|
||
|
* after we determined the size of the list. */
|
||
|
if(NULL == recvreq)
|
||
|
break;
|
||
|
|
||
|
proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
|
||
|
bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
|
||
|
if (bml_endpoint != ep) {
|
||
|
OPAL_THREAD_LOCK(&mca_pml_bfo.lock);
|
||
|
opal_list_append(&mca_pml_bfo.recv_pending,
|
||
|
(opal_list_item_t*)recvreq);
|
||
|
OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock);
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, 2);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Call each time we get a completion event on ACK or PUT message.
|
||
|
* These types of messages are receive control type messages. This
|
||
|
* function is only called if the underlying BTL supports failover.
|
||
|
* Otherwise, there is no need for this check.
|
||
|
*/
|
||
|
void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
|
||
|
struct mca_btl_base_descriptor_t* des,
|
||
|
int status)
|
||
|
{
|
||
|
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval;
|
||
|
mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */
|
||
|
struct mca_btl_base_descriptor_t* rdma_des;
|
||
|
mca_pml_bfo_recv_request_t* recvreq;
|
||
|
|
||
|
if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
|
||
|
switch (common->hdr_type) {
|
||
|
case MCA_PML_BFO_HDR_TYPE_ACK:
|
||
|
recvreq = des->des_cbdata;
|
||
|
|
||
|
/* Record the error. Send RECVERRNOTIFY if necessary. */
|
||
|
if (recvreq->req_errstate) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"ACK: completion failed, error already seen, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
} else {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"ACK: completion failed, sending RECVERRNOTIFY to sender, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_ACK, status);
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case MCA_PML_BFO_HDR_TYPE_PUT:
|
||
|
hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
rdma_des = hdr->hdr_des.pval;
|
||
|
recvreq = des->des_cbdata;
|
||
|
if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) {
|
||
|
/* We now record the error, send the RECVERRNOTIFY if
|
||
|
* necessary, and free the descriptor. Prior to this,
|
||
|
* we want to ensure that we have not reached the case
|
||
|
* where the PUT message actually made it over and we
|
||
|
* have already received a FIN back. We first check to
|
||
|
* see if the RDMA descriptor cbdata is pointing to
|
||
|
* NULL. If it is, this means that the PUT message must
|
||
|
* have made it over and a corresponding FIN already
|
||
|
* made it back and freed the RDMA descriptor. Second,
|
||
|
* if it is non-null, we make sure that it is pointing
|
||
|
* to the same request as the PUT descriptor is. If
|
||
|
* it is not, again we assume that the FIN came back
|
||
|
* and freed it. And we can count on the fact that the
|
||
|
* recvreq has not been freed or reused as it is held
|
||
|
* until this very completion event occurs. */
|
||
|
if (recvreq->req_errstate) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"PUT: completion failed, error already seen, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
} else {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"PUT: completion failed, sending RECVERRNOTIFY to sender, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, status);
|
||
|
}
|
||
|
#if 0
|
||
|
/* TODO: Add descriptor to receive request so it can
|
||
|
* be freed only when receive request is freed and
|
||
|
* only if needed. */
|
||
|
btl->btl_free(btl, rdma_des);
|
||
|
#endif
|
||
|
}
|
||
|
break;
|
||
|
default:
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
switch (common->hdr_type) {
|
||
|
case MCA_PML_BFO_HDR_TYPE_ACK:
|
||
|
recvreq = des->des_cbdata;
|
||
|
recvreq->req_events--;
|
||
|
assert(recvreq->req_events >= 0);
|
||
|
if(OPAL_UNLIKELY (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"ACK: completion: recvreq in error, outstanding events=%d "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
|
||
|
recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq, status,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
if (0 == recvreq->req_events) {
|
||
|
mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_ACK,
|
||
|
status, btl);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
recv_request_pml_complete_check(recvreq);
|
||
|
break;
|
||
|
case MCA_PML_BFO_HDR_TYPE_PUT:
|
||
|
recvreq = des->des_cbdata;
|
||
|
recvreq->req_events--;
|
||
|
assert(recvreq->req_events >= 0);
|
||
|
if(OPAL_UNLIKELY(recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"PUT: completion: recvreq in error, outstanding events=%d "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
|
||
|
recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq, status,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
if (0 == recvreq->req_events) {
|
||
|
mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_PUT,
|
||
|
status, btl);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
recv_request_pml_complete_check(recvreq);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Register four functions to handle extra PML message types that
|
||
|
* are utilized when a failover occurs.
|
||
|
*/
|
||
|
int mca_pml_bfo_register_callbacks(void) {
|
||
|
int rc;
|
||
|
/* The following four functions are utilized when failover
|
||
|
* support for openib is enabled. */
|
||
|
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
|
||
|
mca_pml_bfo_recv_frag_callback_rndvrestartnotify,
|
||
|
NULL );
|
||
|
if(OMPI_SUCCESS != rc)
|
||
|
return rc;
|
||
|
|
||
|
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK,
|
||
|
mca_pml_bfo_recv_frag_callback_rndvrestartack,
|
||
|
NULL );
|
||
|
if(OMPI_SUCCESS != rc)
|
||
|
return rc;
|
||
|
|
||
|
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK,
|
||
|
mca_pml_bfo_recv_frag_callback_rndvrestartnack,
|
||
|
NULL );
|
||
|
if(OMPI_SUCCESS != rc)
|
||
|
return rc;
|
||
|
|
||
|
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
|
||
|
mca_pml_bfo_recv_frag_callback_recverrnotify,
|
||
|
NULL );
|
||
|
if(OMPI_SUCCESS != rc)
|
||
|
return rc;
|
||
|
|
||
|
return rc;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Update a few fields when we are restarting either a RNDV or
|
||
|
* RGET type message.
|
||
|
*/
|
||
|
void mca_pml_bfo_update_rndv_fields(mca_pml_bfo_hdr_t* hdr,
|
||
|
mca_pml_bfo_send_request_t* sendreq, char *type)
|
||
|
{
|
||
|
hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART;
|
||
|
hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv;
|
||
|
hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq;
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"%s: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, "
|
||
|
"src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, (uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq,
|
||
|
sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* The following set of functions are all called when it is determined
|
||
|
* that the cached bml_btl->btl does not match the btl handed back
|
||
|
* by the callback function. This means that the bml_btl array has
|
||
|
* been shuffled and the bml_btl matching the btl has to be found
|
||
|
* back. If it cannot be found, then just find a different one to
|
||
|
* use.
|
||
|
*/
|
||
|
void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t** bml_btl,
|
||
|
mca_btl_base_module_t* btl,
|
||
|
struct mca_btl_base_descriptor_t* des)
|
||
|
{
|
||
|
if ((*bml_btl)->btl != btl) {
|
||
|
mca_pml_bfo_common_hdr_t * common = des->des_local->seg_addr.pval;
|
||
|
mca_pml_bfo_ack_hdr_t* ack; /* ACK header */
|
||
|
mca_pml_bfo_recv_request_t* recvreq = NULL;
|
||
|
char *type = NULL;
|
||
|
|
||
|
switch (common->hdr_type) {
|
||
|
case MCA_PML_BFO_HDR_TYPE_ACK:
|
||
|
ack = (mca_pml_bfo_ack_hdr_t*)des->des_local->seg_addr.pval;
|
||
|
recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval;
|
||
|
type = "ACK";
|
||
|
break;
|
||
|
case MCA_PML_BFO_HDR_TYPE_PUT:
|
||
|
recvreq = des->des_cbdata;
|
||
|
type = "PUT";
|
||
|
break;
|
||
|
default:
|
||
|
/* In theory, this can never happen. */
|
||
|
opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)",
|
||
|
__FILE__, __LINE__, common->hdr_type);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
|
||
|
mca_pml_bfo_find_recvreq_eager_bml_btl(bml_btl, btl, recvreq, type);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void mca_pml_bfo_find_sendreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||
|
mca_btl_base_module_t* btl,
|
||
|
mca_pml_bfo_send_request_t* sendreq,
|
||
|
char* type)
|
||
|
{
|
||
|
if ((*bml_btl)->btl != btl) {
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML does not match BTL, find it back, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, (uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq, (void *)sendreq,
|
||
|
sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
*bml_btl = mca_bml_base_btl_array_find(&sendreq->req_endpoint->btl_eager, btl);
|
||
|
if (NULL == *bml_btl) {
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML is gone, find another one, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, (uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq, (void *)sendreq,
|
||
|
sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
*bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->btl_eager);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void mca_pml_bfo_find_sendreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||
|
mca_btl_base_module_t* btl,
|
||
|
mca_pml_bfo_send_request_t* sendreq,
|
||
|
char* type)
|
||
|
{
|
||
|
if ((*bml_btl)->btl != btl) {
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML does not match BTL, find it back, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, (uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq, (void *)sendreq,
|
||
|
sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
*bml_btl = mca_bml_base_btl_array_find(&sendreq->req_endpoint->btl_rdma, btl);
|
||
|
if (NULL == *bml_btl) {
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML is gone, find another one, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, (uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq, (void *)sendreq,
|
||
|
sendreq->req_recv.pval,
|
||
|
sendreq->req_send.req_base.req_peer);
|
||
|
*bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->btl_rdma);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void mca_pml_bfo_find_recvreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||
|
mca_btl_base_module_t* btl,
|
||
|
mca_pml_bfo_recv_request_t* recvreq,
|
||
|
char* type)
|
||
|
{
|
||
|
if ((*bml_btl)->btl != btl) {
|
||
|
ompi_proc_t *proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML does not match BTL, find it back, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
|
||
|
*bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, btl);
|
||
|
if (NULL == *bml_btl) {
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML is gone, find another one, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
|
||
|
*bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
|
||
|
mca_btl_base_module_t* btl,
|
||
|
mca_pml_bfo_recv_request_t* recvreq,
|
||
|
char* type)
|
||
|
{
|
||
|
if ((*bml_btl)->btl != btl) {
|
||
|
ompi_proc_t *proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc;
|
||
|
mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
|
||
|
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML does not match BTL, find it back, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
|
||
|
*bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
|
||
|
if (NULL == *bml_btl) {
|
||
|
opal_output_verbose(25, mca_pml_bfo_output,
|
||
|
"%s completion: BML is gone, find another one, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
|
||
|
type, recvreq->req_msgseq, recvreq->req_restartseq,
|
||
|
recvreq->remote_req_send.pval, (void *)recvreq,
|
||
|
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
|
||
|
|
||
|
*bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* The completion event for the RNDV message has returned with an
|
||
|
* error. We know that the send request we are looking at is valid
|
||
|
* because it cannot be completed until the sendreq->req_state value
|
||
|
* reaches 0. And for the sendreq->req_state to reach 0, the
|
||
|
* completion event on the RNDV message must occur. So, we do not
|
||
|
* bother checking whether the send request is valid, because we know
|
||
|
* it is, but we put a few asserts in for good measure. We then check
|
||
|
* a few fields in the request to decide what to do. If the
|
||
|
* sendreq->req_error is set, that means that something has happend
|
||
|
* already to the request and we do not want to restart it.
|
||
|
* Presumably, we may have received a RECVERRNOTIFY message from the
|
||
|
* receiver. We also check the sendreq->req_acked field to see if it
|
||
|
* has been acked. If it has, then again we do not restart everything
|
||
|
* because obviously the RNDV message has made it to the other side.
|
||
|
*/
|
||
|
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
|
||
|
mca_pml_bfo_send_request_t* sendreq)
|
||
|
{
|
||
|
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_ctx ==
|
||
|
(sendreq)->req_send.req_base.req_comm->c_contextid);
|
||
|
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_src ==
|
||
|
(sendreq)->req_send.req_base.req_comm->c_my_rank);
|
||
|
assert(((mca_pml_bfo_hdr_t*)((des)->des_local->seg_addr.pval))->hdr_match.hdr_seq ==
|
||
|
(uint16_t)(sendreq)->req_send.req_base.req_sequence);
|
||
|
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) {
|
||
|
(sendreq)->req_events--;
|
||
|
/* Assume RNDV did not make it, so restart from the beginning. */
|
||
|
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV);
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Check to see if an error has occurred on this send request. If it has
|
||
|
* and there are no outstanding events, then we can start the restart dance.
|
||
|
*/
|
||
|
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq,
|
||
|
int status,
|
||
|
mca_btl_base_module_t* btl,
|
||
|
int type,
|
||
|
char *description)
|
||
|
{
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"%s: completion: sendreq has error, outstanding events=%d, "
|
||
|
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
|
||
|
description,
|
||
|
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
sendreq->req_restartseq, (void *)sendreq,
|
||
|
sendreq->req_recv.pval,
|
||
|
status, sendreq->req_send.req_base.req_peer);
|
||
|
if (0 == sendreq->req_events) {
|
||
|
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
|
||
|
type, status, btl);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* If we get an error on the RGET message, then first make sure that
|
||
|
* header matches the send request that we are pointing to. This is
|
||
|
* necessary, because even though the sending side got an error, the
|
||
|
* RGET may have made it to the receiving side and the message transfer
|
||
|
* may have completed. This would then mean the send request has been
|
||
|
* completed and perhaps in use by another communication. So there is
|
||
|
* no need to restart this request. Therefore, ensure that we are
|
||
|
* looking at the same request that the header thinks we are looking
|
||
|
* at. If not, then there is nothing else to be done. */
|
||
|
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des)
|
||
|
{
|
||
|
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
|
||
|
mca_pml_bfo_hdr_t* hdr = des->des_local->seg_addr.pval;
|
||
|
switch (hdr->hdr_common.hdr_type) {
|
||
|
case MCA_PML_BFO_HDR_TYPE_RGET:
|
||
|
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
|
||
|
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) ||
|
||
|
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) {
|
||
|
opal_output_verbose(30, mca_pml_bfo_output,
|
||
|
"RGET: completion event: dropping because no valid request "
|
||
|
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
|
||
|
"RQS:exp=%d,act=%d, dst_req=%p",
|
||
|
(uint16_t)sendreq->req_send.req_base.req_sequence,
|
||
|
hdr->hdr_match.hdr_seq,
|
||
|
sendreq->req_send.req_base.req_comm->c_contextid,
|
||
|
hdr->hdr_match.hdr_ctx,
|
||
|
sendreq->req_send.req_base.req_comm->c_my_rank,
|
||
|
hdr->hdr_match.hdr_src,
|
||
|
sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq,
|
||
|
(void *)sendreq);
|
||
|
return;
|
||
|
}
|
||
|
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET);
|
||
|
return;
|
||
|
default:
|
||
|
opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)",
|
||
|
__FILE__, __LINE__, hdr->hdr_common.hdr_type);
|
||
|
ompi_rte_abort(-1, NULL);
|
||
|
}
|
||
|
}
|