/* * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /** * @file * Functions that implement failover capabilities. To utilize the * failover feature, one needs to configure the library with * --enable-openib-failover. Then the system that is being used * must have two or more openib BTLs in use. When an error occurs, * the BTL will call into this PML to map out the offending BTL and * continue using the one that is still working. * Most of the differences between the ob1 PML and the bfo PML are * contained in this file. */ #include "ompi_config.h" #include #include #include "opal/class/opal_bitmap.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/base/base.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/pml/base/base.h" #include "ompi/mca/btl/base/base.h" #include "pml_bfo.h" #include "pml_bfo_component.h" #include "pml_bfo_comm.h" #include "pml_bfo_hdr.h" #include "pml_bfo_recvfrag.h" #include "pml_bfo_sendreq.h" #include "pml_bfo_recvreq.h" #include "pml_bfo_rdmafrag.h" #include "pml_bfo_failover.h" #include "ompi/mca/bml/base/base.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/util/show_help.h" #include "orte/mca/notifier/notifier.h" #include "ompi/runtime/ompi_cr.h" static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl, mca_bml_base_endpoint_t* ep); /** * When running with failover enabled, check the PML sequence numbers * to see if we have received a duplicate message. This check is done * for for all MATCH fragments. It is also done for RNDV and RGET * fragments that do not have the MCA_PML_BFO_HDR_FLAGS_RESTART flag * set. * We set the window size to half the total range of sequence numbers. * We only enter this code when the seq_num is not the expected one. * A few more notes on the algorithm used here. 
In normal operation, * the expected value will either be equal to or less than the * sequence number of the header. This is because we are using this * sequence number to detect packets arriving prior to them being * expected. If we determine that expected is less than header, then * make sure this is not a rollover case. We do that by adding the * maxnum to the expected. * @param proc Pointer to proc from where message came * @param hdr Pointer to header of message */ bool mca_pml_bfo_is_duplicate_msg(mca_pml_bfo_comm_proc_t* proc, mca_pml_bfo_match_hdr_t *hdr) { const int window = 32768; const int maxnum = 65536; mca_pml_bfo_recv_frag_t *frag; #if 0 opal_output(0, "checking dup, exp=%d, act=%d, type=%d, cant_match=%d\n", (uint16_t)proc->expected_sequence, hdr->hdr_seq, hdr->hdr_common.hdr_type, opal_list_get_size(&proc->frags_cant_match)); #endif /* Few cases near end of values where expected may equal 65535 and * an out of order shows up that may equal something like 1. */ if (OPAL_UNLIKELY((uint16_t)proc->expected_sequence > hdr->hdr_seq)) { if (((uint16_t)proc->expected_sequence - hdr->hdr_seq) < window) { opal_output_verbose(20, mca_pml_bfo_output, "%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n", __FILE__, __LINE__, (uint16_t)proc->expected_sequence, hdr->hdr_seq, hdr->hdr_common.hdr_type); return true; } } else { /* This is the normal flow through this code. We also need to * use the maxnum to ensure that we handle cases where the * expected number has rolled over but then a duplicate message * shows up that is greater than it. */ if ((((uint16_t)proc->expected_sequence + maxnum) - hdr->hdr_seq) < window) { opal_output_verbose(20, mca_pml_bfo_output, "%s:%d: frag duplicated, exp=%d, act=%d, type=%d\n", __FILE__, __LINE__, (uint16_t)proc->expected_sequence, hdr->hdr_seq, hdr->hdr_common.hdr_type); return true; } } /* Need to explicitly check against any out of order fragments. 
Unfortunately, we * always have to do this since we can get a duplicate out of order fragment. */ if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) { for(frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match); frag != (mca_pml_bfo_recv_frag_t*)opal_list_get_end(&proc->frags_cant_match); frag = (mca_pml_bfo_recv_frag_t*)opal_list_get_next(frag)) { mca_pml_bfo_match_hdr_t* mhdr = &frag->hdr.hdr_match; if(mhdr->hdr_seq == hdr->hdr_seq) { opal_output_verbose(20, mca_pml_bfo_output, "%s:%d: frag duplicated on frags_cant_match list, seq=%d, type=%d\n", __FILE__, __LINE__, hdr->hdr_seq, hdr->hdr_common.hdr_type); return true; } } } return false; } /** * This function checks to see if we have received a duplicate FIN * message. This is done by first pulling the pointer of the request * that the FIN message is pointing to from the message. We then * check the various fields in the request to the fields in the header * and make sure they match. If they do not, then the request must * have been recycled already and this is a duplicate FIN message. We * have to do this check on every FIN message that we receive. */ bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descriptor_t* rdma, mca_btl_base_module_t* btl) { mca_pml_base_request_t* basereq; /* When running with failover enabled, need to ensure that this * is not a duplicate FIN message. */ if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { /* The first check is to make sure the descriptor is pointing * to a valid request. The descriptor may be pointing to NULL * if it was freed and not reused yet. 
*/ if (NULL == rdma->des_cbdata) { opal_output_verbose(20, mca_pml_bfo_output, "FIN: received: dropping because not pointing to valid descriptor " "PML=%d CTX=%d SRC=%d RQS=%d", hdr->hdr_fin.hdr_match.hdr_seq, hdr->hdr_fin.hdr_match.hdr_ctx, hdr->hdr_fin.hdr_match.hdr_src, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags); return true; } basereq = (mca_pml_base_request_t*)rdma->des_cbdata; /* Now we know the descriptor is pointing to a non-null request. * Does it match what we expect? To make sure the receiver request * matches the FIN message, check the context number, source of the * message, and MPI sequence number. Then make sure that it also * matches the internal sequencing number of the requests. We need * to look at the type of request we are pointing at to figure out * what fields to access. */ if (basereq->req_type == MCA_PML_REQUEST_RECV) { mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)basereq; if ((hdr->hdr_fin.hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) || (hdr->hdr_fin.hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || (hdr->hdr_fin.hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { opal_output_verbose(5, mca_pml_bfo_output, "FIN: received on receiver: dropping because no match " "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " "RQS:exp=%d,act=%d, dst_req=%p", (uint16_t)recvreq->req_msgseq, hdr->hdr_fin.hdr_match.hdr_seq, recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_fin.hdr_match.hdr_ctx, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, hdr->hdr_fin.hdr_match.hdr_src, recvreq->req_restartseq, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags, (void *)recvreq); return true; } if (hdr->hdr_fin.hdr_match.hdr_common.hdr_flags != recvreq->req_restartseq) { opal_output_verbose(5, mca_pml_bfo_output, "FIN: received on receiver: dropping because old " "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " "RQS:exp=%d,act=%d, dst_req=%p", 
(uint16_t)recvreq->req_msgseq, hdr->hdr_fin.hdr_match.hdr_seq, recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_fin.hdr_match.hdr_ctx, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, hdr->hdr_fin.hdr_match.hdr_src, recvreq->req_restartseq, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags, (void *)recvreq); return true; } } else if (basereq->req_type == MCA_PML_REQUEST_SEND) { mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)basereq; if ((hdr->hdr_fin.hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || (hdr->hdr_fin.hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) || (hdr->hdr_fin.hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence; opal_output_verbose(5, mca_pml_bfo_output, "FIN: received on sender: dropping because no match " "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " "RQS:exp=%d,act=%d, dst_req=%p", seq, hdr->hdr_fin.hdr_match.hdr_seq, sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_fin.hdr_match.hdr_ctx, sendreq->req_send.req_base.req_peer, hdr->hdr_fin.hdr_match.hdr_src, sendreq->req_restartseq, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags, (void *)sendreq); return true; } if (hdr->hdr_fin.hdr_match.hdr_common.hdr_flags != sendreq->req_restartseq) { uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence; opal_output_verbose(5, mca_pml_bfo_output, "FIN: received on sender: dropping because old " "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " "RQS:exp=%d,act=%d, dst_req=%p", seq, hdr->hdr_fin.hdr_match.hdr_seq, sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_fin.hdr_match.hdr_ctx, sendreq->req_send.req_base.req_peer, hdr->hdr_fin.hdr_match.hdr_src, sendreq->req_restartseq, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags, (void *)sendreq); return true; } } else { /* We can get here if the descriptor has been reused, but * not as an RDMA descriptor. 
In that case, the callback * function has been set to something else. Clearly the * descriptor we are interested is gone, so just drop the * FIN message. */ opal_output_verbose(5, mca_pml_bfo_output, "FIN: received: dropping because descriptor has been reused " "PML=%d CTX=%d SRC=%d RQS=%d rdma->des_flags=%d", hdr->hdr_fin.hdr_match.hdr_seq, hdr->hdr_fin.hdr_match.hdr_ctx, hdr->hdr_fin.hdr_match.hdr_src, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags, rdma->des_flags); return true; } } return false; } /** * Repost a FIN message if we get an error on the completion event. */ void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) { /* In the error case, we will repost the FIN message. I had * considered restarting the request. The problem is that the * request may be already complete when we detect that a FIN * message got an error on its completion event. For example, with * the PUT protocol, if the RDMA writes succeed and all the data * has been sent, then the request is marked as complete and can be * freed. Therefore, an error on the FIN message has no request to * refer back to. So, we will just repost it. However, we are also * faced with the case where the FIN message has an error but it * actually makes it to the other side. In that case we are now * sending a FIN message to a non-existent request on the receiver * side. To handle that, we have added the match information to * the FIN message. That way, we can check on the receiving side * to ensure that it is pointing to a valid request. 
*/ mca_pml_bfo_fin_hdr_t* hdr; mca_bml_base_endpoint_t* bml_endpoint; ompi_proc_t *proc; mca_bml_base_btl_t* bml_btl; proc = (ompi_proc_t*) des->des_cbdata; bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; hdr = (mca_pml_bfo_fin_hdr_t*)des->des_src->seg_addr.pval; opal_output_verbose(20, mca_pml_bfo_output, "REPOST: BFO_HDR_TYPE_FIN: seq=%d,myrank=%d,peer=%d,hdr->hdr_fail=%d,src=%d", hdr->hdr_match.hdr_seq, ORTE_PROC_MY_NAME->vpid, proc->proc_name.vpid, hdr->hdr_fail, hdr->hdr_match.hdr_src); bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); /* Reconstruct the fin for sending on the other BTL */ mca_pml_bfo_send_fin(proc, bml_btl, hdr->hdr_des, MCA_BTL_NO_ORDER, hdr->hdr_fail, hdr->hdr_match.hdr_seq, hdr->hdr_match.hdr_common.hdr_flags, hdr->hdr_match.hdr_ctx, hdr->hdr_match.hdr_src); return; } /** * This function is called when a RNDV or RGET is received with the * FLAGS_RESTART flag set. This means this message already has a * receive request already associated with it. */ mca_pml_bfo_recv_request_t* mca_pml_bfo_get_request(mca_pml_bfo_match_hdr_t *hdr) { mca_pml_bfo_recv_request_t *match = NULL; mca_pml_bfo_rendezvous_hdr_t * rhdr = (mca_pml_bfo_rendezvous_hdr_t *) hdr; match = (mca_pml_bfo_recv_request_t *) rhdr->hdr_dst_req.pval; /* Check to see if we have received a duplicate RNDV (or RGET). This can * occur because we got an error when we reposted the RNDV. Therefore, * we make sure that the request has not completed from underneath us * and been recycled. Secondly, make sure we are not getting it a * second time for the same request. 
*/ if ((rhdr->hdr_match.hdr_ctx != match->req_recv.req_base.req_comm->c_contextid) || (rhdr->hdr_match.hdr_src != match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || (rhdr->hdr_match.hdr_seq != (uint16_t)match->req_msgseq) || (rhdr->hdr_restartseq == match->req_restartseq)) { if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) { opal_output_verbose(20, mca_pml_bfo_output, "RNDV: received with RESTART flag: duplicate, dropping " "PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d", match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, match->remote_req_send.pval, (void *)match, match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); } else { opal_output_verbose(20, mca_pml_bfo_output, "RGET: received with RESTART flag: duplicate, dropping " "PML:exp=%d,act=%d RQS=%d, src_req=%p, dst_req=%p, peer=%d", match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, match->remote_req_send.pval, (void *)match, match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); } return NULL; } mca_pml_bfo_recv_request_reset(match); if (hdr->hdr_common.hdr_type == MCA_PML_BFO_HDR_TYPE_RNDV) { opal_output_verbose(30, mca_pml_bfo_output, "RNDV: received with RESTART flag: restarting recv, " "PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d", match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, match->remote_req_send.pval, (void *)match, match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); } else { opal_output_verbose(30, mca_pml_bfo_output, "RGET: received with RESTART flag: restarting recv, " "PML:exp=%d,act=%d RQS(new)=%d, src_req=%p, dst_req=%p, peer=%d", match->req_msgseq, rhdr->hdr_match.hdr_seq, match->req_restartseq, match->remote_req_send.pval, (void *)match, match->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); } return match; } /** * Callback for when a RNDVRESTARTNOTIFY message is received. 
A * RNDVRESTARTNOTIFY message is sent from the sender to the receiver * telling the receiver that the message is going to be started over. * The receiver first makes sure that the request being pointed to is * still valid. If it is not, that means the receiver must have * completed the request and therefore we need to send a NACK back to * the sender. The receiver then makes sure this is not a duplicate * message. If it is a duplicate, it will just drop it. Otherwise, * it will then send a RNDVRESTARTACK message if there are no * outstanding events on the receiver. Otherwise, it will just change * the state of the request and wait for another event to send the * RNDVRESTARTACK to the sender. */ void mca_pml_bfo_recv_frag_callback_rndvrestartnotify(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { mca_btl_base_segment_t* segments = des->des_dst; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_recv_request_t* recvreq; ompi_proc_t* ompi_proc; orte_process_name_t orte_proc; bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY); recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_restart.hdr_dst_req.pval; /* Check to see if the receive request is still valid. If the * request is recycled, that means the original request must have * completed and we therefore need to send a NACK back to the sender. * Note that when the request is gone, we need to pull some information * off the header so that we can figure out where to send the NACK * message back to. 
*/ if ((hdr->hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) || (hdr->hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) || (hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) { orte_proc.jobid = hdr->hdr_restart.hdr_jobid; orte_proc.vpid = hdr->hdr_restart.hdr_vpid; ompi_proc = ompi_proc_find(&orte_proc); opal_output_verbose(20, mca_pml_bfo_output, "RNDVRESTARTNOTIFY: received: does not match request, sending NACK back " "PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d " "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d, hdr->hdr_jobid=%d, " "hdr->hdr_vpid=%d, ompi_proc->proc_hostname=%s", (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, hdr->hdr_match.hdr_src, recvreq->req_restartseq, hdr->hdr_restart.hdr_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, hdr->hdr_restart.hdr_jobid, hdr->hdr_restart.hdr_vpid, ompi_proc->proc_hostname); mca_pml_bfo_recv_request_rndvrestartnack(des, ompi_proc, false); return; } /* We know that we have the correct receive request. Make sure this is not * a duplicate RNDVRESTARTNOTIFY on this request. 
*/ if (hdr->hdr_restart.hdr_restartseq == recvreq->req_restartseq) { opal_output_verbose(20, mca_pml_bfo_output, "RNDVRESTARTNOTIFY: received duplicate: dropping RNDVRESTARTNOTIFY " "message PML:req=%d,hdr=%d CTX:req=%d,hdr=%d SRC:req=%d,hdr=%d " "RQS:req=%d,hdr=%d src_req=%p, dst_req=%p, peer=%d", (uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq, recvreq->req_recv.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, hdr->hdr_match.hdr_src, recvreq->req_restartseq, hdr->hdr_restart.hdr_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); return; } /* Increment restart number. */ recvreq->req_restartseq++; recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED; opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTNOTIFY: received: outstanding receive events=%d, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); if (0 == recvreq->req_events) { mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, OMPI_SUCCESS, btl); } return; } /** * Callback for when a RNDVRESTARTACK message is received. This * message is sent from the receiver to the sender to acknowledge * the receipt of the RNDVRESTARTNOTIFY message. At this point, * the sender can reset the send request and restart the message. 
*/ void mca_pml_bfo_recv_frag_callback_rndvrestartack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { mca_btl_base_segment_t* segments = des->des_dst; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_send_request_t* sendreq; bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK); sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval; /* Check to see if we have received a duplicate message. The * first three comparisons make sure that we are not looking at a * recycled request. The last check makes sure we are not getting * a duplicate message for this specific request. All of this is * needed because the receiver might get an error and repost the * RNDVRESTARTACK message, but the RNDVRESTARTACK was actually received. */ if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) || (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence) || (hdr->hdr_restart.hdr_restartseq != sendreq->req_restartseq)) { opal_output_verbose(20, mca_pml_bfo_output, "RNDVRESTARTACK: received: does not match request, dropping " "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d " "src_req=%p, dst_req=%p, peer=%d", (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq, sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); return; } sendreq->req_restart++; if (2 == sendreq->req_restart) { opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTACK: received: restarting send " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq, (void *)sendreq, sendreq->req_recv.pval, 
sendreq->req_send.req_base.req_peer); mca_pml_bfo_send_request_restart(sendreq, false, 0); } else { opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTACK received: waiting for RNDVRESTARTNOTIFY completion " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", hdr->hdr_match.hdr_seq, hdr->hdr_restart.hdr_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); } return; } /** * Callback for when a RECVERRNOTIFY message is received. This message * is sent from the receiver to the sender and tells the sender that * the receiver has seen an error. This will trigger the sender * to start the request restart sequence. */ void mca_pml_bfo_recv_frag_callback_recverrnotify(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { mca_btl_base_segment_t* segments = des->des_dst; mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval; mca_pml_bfo_send_request_t* sendreq; bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY); sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval; /* First make sure that this message is pointing to a valid request. * This can be determined if the communicator context, the source of * the message, and the MPI sequence number all match. 
*/ if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) || (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { opal_output_verbose(20, mca_pml_bfo_output, "RECVERRNOTIFY: received: does not match request, dropping " "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d RQS:exp=%d,act=%d " "src_req=%p, dst_req=%p, peer=%d", (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq, sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx, sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src, sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); return; } /* If a good ACK was never received, then the first ACK received * might be a RECVERRNOTIFY message. In that case, the sendreq does not * have a valid req_recv pointer in it. Therefore, check for that * case and update the field in the sendreq if necessary. */ if (NULL == sendreq->req_recv.pval) { sendreq->req_recv = hdr->hdr_restart.hdr_dst_req; } /* Now check to see a restart needs to be issued. The request * sequence number in the header is compared against the current * request sequence number in the send request. If the header * sequence number is greater than or equal to the send request * number, then a rndvrestartnotify is issued. There are some cases * where a few extra rndvrestartnotifys are issued. That is OK as * it will all work itself out. The idea is to prevent many * restarts unnecessarily. This still allows multiple restarts to * happen. It could be that sometime later another error occurs * which initiates a restart. That is OK as it will have the new * sequence number and all is well. 
*/ if (hdr->hdr_restart.hdr_restartseq >= sendreq->req_restartseq) { assert(sendreq->req_send.req_base.req_ompi.req_state == OMPI_REQUEST_ACTIVE); sendreq->req_error++; opal_output_verbose(30, mca_pml_bfo_output, "RECVERRNOTIFY: received: sendreq has error, outstanding events=%d, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); if (0 == sendreq->req_events) { mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY, OMPI_SUCCESS, btl); } } else { opal_output_verbose(30, mca_pml_bfo_output, "RECVERRNOTIFY: received: error has already been noted, ignoring " "PML:exp=%d,act=%d RQS:exp=%d,act=%d src_req=%p, dst_req=%p, peer=%d", sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq, (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); } return; } /** * Callback for when a RNDVRESTARTNACK message is received. This message * is sent from the receiver to the sender and tells the sender that * the receiver has already completed the message and there is nothing * else to be done. The sender should then just make the send request * complete. 
*/
void mca_pml_bfo_recv_frag_callback_rndvrestartnack(mca_btl_base_module_t* btl,
                                                    mca_btl_base_tag_t tag,
                                                    mca_btl_base_descriptor_t* des,
                                                    void* cbdata )
{
    /* btl, tag and cbdata are not used by this callback; the header is
     * read from the first destination segment of des. */
    mca_btl_base_segment_t* seg = des->des_dst;
    mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)seg->seg_addr.pval;
    mca_pml_bfo_send_request_t* sendreq;
    bool matches_request;

    bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK);
    sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_restart.hdr_src_req.pval;

    /* Not convinced a RNDVRESTARTNACK that does not match a request can
     * happen, but the check is kept anyway: context id, source, sequence
     * number and restart sequence all have to agree with the request. */
    matches_request =
        (hdr->hdr_match.hdr_ctx == sendreq->req_send.req_base.req_comm->c_contextid) &&
        (hdr->hdr_match.hdr_src == sendreq->req_send.req_base.req_peer) &&
        (hdr->hdr_match.hdr_seq == (uint16_t)sendreq->req_send.req_base.req_sequence) &&
        (hdr->hdr_restart.hdr_restartseq == sendreq->req_restartseq);

    if (!matches_request) {
        opal_output_verbose(20, mca_pml_bfo_output,
                            "RNDVRESTARTNACK: received: does not match request, dropping "
                            "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d EXP:exp=%d,act=%d "
                            "src_req=%p, dst_req=%p, peer=%d",
                            (uint16_t)sendreq->req_send.req_base.req_sequence, hdr->hdr_match.hdr_seq,
                            sendreq->req_send.req_base.req_comm->c_contextid, hdr->hdr_match.hdr_ctx,
                            sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
                            sendreq->req_restartseq, hdr->hdr_restart.hdr_restartseq,
                            (void *)sendreq, sendreq->req_recv.pval,
                            sendreq->req_send.req_base.req_peer);
        return;
    }

    opal_output_verbose(20, mca_pml_bfo_output,
                        "RNDVRESTARTNACK: received: marking send request as complete "
                        "PML=%d CTX=%d SRC=%d EXP=%d "
                        "src_req=%p, dst_req=%p, peer=%d",
                        (uint16_t)sendreq->req_send.req_base.req_sequence,
                        sendreq->req_send.req_base.req_comm->c_contextid,
                        sendreq->req_send.req_base.req_peer, sendreq->req_restartseq,
                        (void *)sendreq, sendreq->req_recv.pval,
                        sendreq->req_send.req_base.req_peer);

    mca_pml_bfo_send_request_rndvrestartnack(sendreq);
}

/**
 * This function gets called when failover is enabled and an error
 * occurs during
the rendezvous protocol. A message is sent to the * receiving side notifying the request that the communication is * going to be starting over. However, none of the information in the * send request is reset yet, so that any in flight fragments can * still find a home. Information in the send request gets reset when * the completion event for this send occurs AND an ACK has been * received back from the receiver. */ void mca_pml_bfo_send_request_rndvrestartnotify(mca_pml_bfo_send_request_t* sendreq, bool repost, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t* btl) { mca_btl_base_descriptor_t* des; mca_pml_bfo_restart_hdr_t* restart; int rc; mca_bml_base_btl_t* bml_btl; ompi_proc_t* proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; /* If this message is not a repost, then update the sequence number. */ if (!repost) { /* Bump up the rendezvous request sequence number. */ sendreq->req_restartseq++; } assert(0 == sendreq->req_events); assert(0 != bml_endpoint->btl_eager.arr_size); /* In the case that this is started because the receiver has * sent us a message, then attempt to use a different BTL than the * error message was received on. This may potentially tickle the * error sooner if this side has not seen it yet. */ bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); if (bml_btl->btl == btl) { /* If there is more than one BTL left, then we will get a * different one. If there is only one, we will just get * the same one back again. That is OK. 
*/ bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); } /* allocate descriptor */ mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_restart_hdr_t), MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if( OPAL_UNLIKELY(NULL == des) ) { opal_output(0, "%s:%d Our of resources, cannot proceed", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } /* fill out header */ restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; restart->hdr_match.hdr_common.hdr_flags = 0; restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY; restart->hdr_match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; restart->hdr_match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; restart->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; restart->hdr_restartseq = sendreq->req_restartseq; restart->hdr_src_req.pval = sendreq; restart->hdr_dst_req = sendreq->req_recv; restart->hdr_dst_rank = sendreq->req_send.req_base.req_peer; /* Needed for NACKs */ restart->hdr_jobid = ORTE_PROC_MY_NAME->jobid; restart->hdr_vpid = ORTE_PROC_MY_NAME->vpid; bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, proc); /* initialize descriptor */ des->des_cbfunc = mca_pml_bfo_rndvrestartnotify_completion; opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTNOTIFY: sent: PML=%d, RQS(new)=%d, CTX=%d, SRC=%d, " "src_req=%p, dst_req=%p, peer=%d", (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, restart->hdr_match.hdr_ctx, restart->hdr_match.hdr_src, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY); if( OPAL_UNLIKELY( rc < 0 ) ) { opal_output(0, "[%s:%d] Cannot send rndvrestartnotify message", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } } /** * This function is called when a RNDVRESTARTNACK message is received 
* by the sender.
 */
void mca_pml_bfo_send_request_rndvrestartnack(mca_pml_bfo_send_request_t* sendreq)
{
    /* A RNDVRESTARTNACK was sent by the receiver.  This means that the
     * receiver is rejecting the RNDVRESTARTNOTIFY message, indicating that
     * the receiver's request is complete.  Therefore, mark the send
     * request complete as well.  This data exchange is over. */
    send_request_pml_complete(sendreq);
}

/**
 * This function restarts a RNDV send request.  When this is called,
 * all the fields in the send request are reset and the send is
 * started over.  The sendreq->req_restartseq will be non-zero which will
 * trigger a special flag in the RNDV header which indicates the match
 * has already happened on the receiving side.
 *
 * @param sendreq  Send request to reset and start again.
 * @param repost   When false, req_restartseq is incremented before the
 *                 restart.  When true it is left unchanged so that a
 *                 duplicate RNDV message can be detected.
 * @param tag      Only used to select the verbose log message: RNDV and
 *                 RGET tags are logged as reposts, any other value logs
 *                 nothing.
 */
void mca_pml_bfo_send_request_restart(mca_pml_bfo_send_request_t* sendreq,
                                      bool repost, mca_btl_base_tag_t tag)
{
    size_t offset = 0;
    opal_list_item_t *range_item;
    mca_bml_base_endpoint_t* endpoint;
    size_t i;

    /* If the tag is something valid, it was a repost.  We could also
     * check the repost field as well.  Maybe the repost argument can be
     * dropped and the tag can double as it. */
    switch (tag) {
    case MCA_PML_BFO_HDR_TYPE_RNDV:
        opal_output_verbose(30, mca_pml_bfo_output,
                            "RNDV: completion failed, reset and repost: PML=%d, RQS=%d, "
                            "CTX=%d, SRC=%d, src_req=%p, peer=%d",
                            (uint16_t)sendreq->req_send.req_base.req_sequence,
                            sendreq->req_restartseq,
                            sendreq->req_send.req_base.req_comm->c_contextid,
                            sendreq->req_send.req_base.req_comm->c_my_rank,
                            (void *)sendreq,
                            sendreq->req_send.req_base.req_peer);
        break;
    case MCA_PML_BFO_HDR_TYPE_RGET:
        opal_output_verbose(30, mca_pml_bfo_output,
                            "RGET: completion failed, reset and repost: PML=%d, RQS=%d, "
                            "CTX=%d, SRC=%d, src_req=%p, peer=%d",
                            (uint16_t)sendreq->req_send.req_base.req_sequence,
                            sendreq->req_restartseq,
                            sendreq->req_send.req_base.req_comm->c_contextid,
                            sendreq->req_send.req_base.req_comm->c_my_rank,
                            (void *)sendreq,
                            sendreq->req_send.req_base.req_peer);
        break;
    default:
        break;
    }

    /* Return mpool resources, they get reacquired when request starts over. */
    mca_pml_bfo_free_rdma_resources(sendreq);

    /* Release any memory in use if this is a buffered send */
    if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED &&
        sendreq->req_send.req_addr != sendreq->req_send.req_base.req_addr) {
        mca_pml_base_bsend_request_fini((ompi_request_t*)sendreq);
    }

    /* Clear out any unsent send ranges.  Drain the list from the front
     * and hand every element back to the send_ranges free list.
     * opal_list_remove_first() yields NULL once the list is empty, which
     * is the same idiom this file uses when walking the pending lists. */
    OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
    while (NULL != (range_item = opal_list_remove_first(&sendreq->req_send_ranges))) {
        OMPI_FREE_LIST_RETURN(&mca_pml_bfo.send_ranges,
                              (ompi_free_list_item_t *)range_item);
    }
    OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);

    /* Reset the converter to the beginning. */
    opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
                                &offset);

    /* Bump up internal sequence number to handle possible duplicate
     * RNDV messages.  In the case of reposting a RNDV message, do not
     * increment the value.  That way, a duplicate message can be
     * detected. */
    if (!repost) {
        sendreq->req_restartseq++;
    }

    /* This code here is essentially the same as
     * mca_pml_bfo_send_request_start() but with a few modifications
     * since we are restarting the request, not starting entirely from
     * scratch. */
    endpoint = (mca_bml_base_endpoint_t*)sendreq->req_send.req_base.req_proc->proc_bml;
    sendreq->req_endpoint = endpoint;
    sendreq->req_state = 0;
    sendreq->req_lock = 0;
    sendreq->req_pipeline_depth = 0;
    sendreq->req_bytes_delivered = 0;
    sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;

    /* Note that we do not reset the following three items.
     * They stay with their original values.
     *     sendreq->req_send.req_base.req_sequence
     *     sendreq->req_restartseq
     *     sendreq->req_recv.pval
     */
    sendreq->req_restart = 0;         /* reset in case we restart again */
    sendreq->req_error = 0;           /* clear error state */
    sendreq->req_events = 0;          /* clear events, probably 0 anyways */

    MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );

    /* Try each eager BTL in turn.  Any result other than "out of
     * resources" ends the attempt here. */
    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
        mca_bml_base_btl_t* bml_btl;
        int rc;

        /* select a btl */
        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
        rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
        if( OPAL_LIKELY(OMPI_ERR_OUT_OF_RESOURCE != rc) )
            return;
    }

    /* Every eager BTL was out of resources: queue the request so the
     * start is retried later. */
    add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
}

/**
 * This function will repost a match fragment.  This function has to
 * handle the case where there may not be a request associated with
 * the fragment and just use the information in the fragment to
 * repost the send.
*/
void mca_pml_bfo_repost_match_fragment(struct mca_btl_base_descriptor_t* des)
{
    mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
    mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
    struct mca_bml_base_endpoint_t* endpoint;
    int rc;
    size_t offset = 0;

    /* At this point a determination has to be made whether the
     * BFO_HDR_TYPE_MATCH fragment was sent via the sendi interface or
     * via the regular send interface.  This is important because if it
     * was sent via the sendi interface, then the request associated
     * with it has already been completed and released.  This can be
     * determined by looking at the des->des_flags field of the
     * descriptor.  If the ALWAYS_CALLBACK flag is set then it is known
     * that there is a valid send request associated with the fragment
     * and it can be used to extricate information.  If ALWAYS_CALLBACK
     * is not set, then the endpoint information is in the callback
     * data field and where to resend the fragment can be determined
     * from the fragment. */
    if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
        endpoint = sendreq->req_endpoint;
        opal_output_verbose(30, mca_pml_bfo_output,
                            "MATCH: repost: src_req=%p",
                            (void *)sendreq);
    } else {
        endpoint = des->des_cbdata;
        opal_output_verbose(30, mca_pml_bfo_output,
                            "MATCH: repost: des=%p (sendi fragment)",
                            (void *)des);
    }

    /* Pick the next eager BTL of the endpoint for the resend. */
    assert(0 != endpoint->btl_eager.arr_size);
    bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);

    if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
        /* Reset the converter to the beginning if the message is
         * not a zero-length message.  In the case of zero-length
         * message, the convertor is not being used. */
        if (0 != sendreq->req_send.req_bytes_packed) {
            opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor,
                                        &offset);
        }
        rc = mca_pml_bfo_send_request_start_btl(sendreq, bml_btl);
        if (OMPI_SUCCESS == rc) {
            return;
        } else if (OMPI_ERR_OUT_OF_RESOURCE == rc) {
            /* Out of resources: queue the request so the start is
             * retried later. */
            opal_output_verbose(30, mca_pml_bfo_output,
                                "Warning: delaying reposting of BFO_HDR_TYPE_MATCH, btls=%d",
                                (int)sendreq->req_endpoint->btl_eager.arr_size);
            add_request_to_send_pending(sendreq, MCA_PML_BFO_SEND_PENDING_START, true);
            return;
        } else {
            opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH",
                        __FILE__, __LINE__);
            orte_errmgr.abort(-1, NULL);
        }
    } else {
        /* No send request available so alloc and repost explicitly */
        mca_btl_base_descriptor_t* newdes = NULL;
        mca_btl_base_segment_t* oldseg;
        mca_btl_base_segment_t* newseg;

        oldseg = des->des_src;
        /* The alloc routine must be called with the MCA_BTL_NO_ORDER
         * flag so that the allocation routine works.  The allocation
         * will fill in the order flag in the descriptor. */
        mca_bml_base_alloc( bml_btl, &newdes,
                            MCA_BTL_NO_ORDER,
                            oldseg->seg_len,
                            MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
        if (OPAL_UNLIKELY(NULL == newdes)) {
            opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH",
                        __FILE__, __LINE__);
            orte_errmgr.abort(-1, NULL);
        }
        newseg = newdes->des_src;
        /* Copy over all the data that is actually sent over the wire */
        memcpy(newseg->seg_addr.pval, oldseg->seg_addr.pval, oldseg->seg_len);
        newseg->seg_len = oldseg->seg_len;

        /* This call will either return OMPI_SUCCESS or OMPI_ERROR.  The
         * OMPI_SUCCESS only says that the send request can be freed.
         * It may be that the message was queued up in the BTL. */
        rc = mca_bml_base_send(bml_btl, newdes, MCA_PML_BFO_HDR_TYPE_MATCH);

        /* Some BTLs will set the CALLBACK flag but we do not want that
         * as there is no longer a request associated with this descriptor.
         * Therefore, always make sure it is cleared.
         * NOTE(review): newdes is touched after mca_bml_base_send();
         * confirm the BTL cannot have released it by this point. */
        newdes->des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

        if( OPAL_LIKELY( rc >= 0 )) {
            /* Just let the normal flow of data free whatever needs
             * to be freed */
            return;
        } else {
            opal_output(0, "%s:%d FATAL ERROR, cannot repost BFO_HDR_TYPE_MATCH",
                        __FILE__, __LINE__);
            orte_errmgr.abort(-1, NULL);
        }
    }
    /* No need to free any descriptors.  The BTLs take care of it since
     * we originally allocated with MCA_BTL_DES_FLAGS_BTL_OWNERSHIP. */
}

/**
 * Completion callback for rndvrestartnotify completion event.  If the
 * RNDVRESTARTACK has already been received, then reset and restart.
 * Otherwise, just update the state and let the RNDVRESTARTACK trigger
 * the reset and restart.
 *
 * @param btl     BTL the completion was reported on; handed to the
 *                repost when the send failed.
 * @param ep      BTL endpoint (not used here).
 * @param des     Descriptor of the RNDVRESTARTNOTIFY message; its source
 *                segment holds the restart header, from which the send
 *                request is recovered.
 * @param status  OMPI_SUCCESS when the send completed, otherwise the
 *                message is reposted.
 */
void mca_pml_bfo_rndvrestartnotify_completion(mca_btl_base_module_t* btl,
                                              struct mca_btl_base_endpoint_t* ep,
                                              struct mca_btl_base_descriptor_t* des,
                                              int status)
{
    mca_pml_bfo_restart_hdr_t* restart;
    mca_pml_bfo_send_request_t* sendreq;

    restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval;
    sendreq = (mca_pml_bfo_send_request_t*) restart->hdr_src_req.pval;

    /* Need to resend this message in the case that it fails */
    if( OPAL_UNLIKELY((OMPI_SUCCESS != status))) {
        opal_output_verbose(30, mca_pml_bfo_output,
                            "RNDVRESTARTNOTIFY: completion failed: repost "
                            "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
                            (uint16_t)sendreq->req_send.req_base.req_sequence,
                            sendreq->req_restartseq,
                            (void *)sendreq, sendreq->req_recv.pval,
                            sendreq->req_send.req_base.req_peer);
        /* Repost the message and indicate it is a repost, not a new one.  No need
         * to check the req_events as this is the only possible outstanding send
         * event when we have posted this message.  We also know the sendreq is still
         * available because nothing can proceed until this completion event happens
         * successfully as we track the req_restart value. */
        mca_pml_bfo_send_request_rndvrestartnotify(sendreq, true,
                                                   MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
                                                   status, btl);
        return;
    }

    /* The req_restart value is incremented to indicate completion of
     * the RNDVRESTARTNOTIFY message.  Then (typically) the arrival of the
     * ACK message will cause the request to reset and restart.  Need to
     * make sure that RNDVRESTARTNOTIFY callback has been called as well as
     * the ACK back from the receiver prior to resetting and restarting
     * the request.  This is needed in case we get an error on the
     * RNDVRESTARTNOTIFY message, but it actually makes it over.  We want
     * to make sure the send request has not restarted yet.  So, keep a
     * counter that counts to 2. */
    sendreq->req_restart++;
    if (2 == sendreq->req_restart) {
        /* src_req is the local send request and dst_req the receiver's
         * request, matching the order used by the failure message above. */
        opal_output_verbose(30, mca_pml_bfo_output,
                            "RNDVRESTARTNOTIFY: completion: restarting request "
                            "PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d",
                            (uint16_t)sendreq->req_send.req_base.req_sequence,
                            sendreq->req_restartseq,
                            sendreq->req_send.req_base.req_comm->c_contextid,
                            (void *)sendreq, sendreq->req_recv.pval,
                            sendreq->req_send.req_base.req_peer);
        mca_pml_bfo_send_request_restart(sendreq, false, 0);
    } else {
        opal_output_verbose(30, mca_pml_bfo_output,
                            "RNDVRESTARTNOTIFY: completion: waiting for ack "
                            "PML=%d, RQS=%d, CTX=%d, src_req=%p, dst_req=%p, peer=%d",
                            (uint16_t)sendreq->req_send.req_base.req_sequence,
                            sendreq->req_restartseq,
                            sendreq->req_send.req_base.req_comm->c_contextid,
                            (void *)sendreq, sendreq->req_recv.pval,
                            sendreq->req_send.req_base.req_peer);
    }
}

/**
 * This function is called when an error is detected on a completion
 * event on the receiving side.  This can come from a ACK, PUT, RDMA
 * read (GET) or RECVERRNOTIFY completion event.  When this happens, check
 * the state of the request and decide if the sender needs to be notified
 * that a problem was seen.
If no RECVERRNOTIFY message has been sent and * no RNDVRESTARTNOTIFY has been received from the sender, then send a * message telling the sender an error was seen. */ void mca_pml_bfo_recv_request_recverrnotify(mca_pml_bfo_recv_request_t* recvreq, mca_btl_base_tag_t tag, int status) { mca_btl_base_descriptor_t* des; mca_pml_bfo_restart_hdr_t* restart; ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; mca_bml_base_btl_t* bml_btl; int rc; assert(0 != bml_endpoint->btl_eager.arr_size); bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); /* allocate descriptor */ mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_restart_hdr_t), MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if( OPAL_UNLIKELY(NULL == des) ) { opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } /* fill out header */ restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; restart->hdr_match.hdr_common.hdr_flags = 0; restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY; restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank; restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq; restart->hdr_restartseq = recvreq->req_restartseq; restart->hdr_src_req = recvreq->remote_req_send; restart->hdr_dst_req.pval = recvreq; bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY, proc); /* initialize descriptor */ des->des_cbfunc = mca_pml_bfo_recv_restart_completion; opal_output_verbose(30, mca_pml_bfo_output, "RECVERRNOTIFY: sending to sender, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d, btl=%p", recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, 
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, (void *)bml_btl->btl); rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY); if( OPAL_UNLIKELY( rc < 0 ) ) { opal_output(0, "[%s:%d] Cannot send recverrnotify message", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } /* Prevent future error messages on this request */ recvreq->req_errstate |= RECVREQ_RECVERRSENT; } /** * This function is called when it may be time to send a RNDVRESTARTACK * message back to the sending side. This can happen because we * received a RNDVRESTARTNOTIFY message from the sender. This can * also happen if we have noticed that the request has received the * RNDVRESTARTNOTIFY message, but has not yet sent out the RNDVRESTARTACK * because there were still some pending receive events on the request. * That means we can enter this routine from a completion event on a ACK, * PUT, or RDMA read as well as from the receipt of a RNDVRESTARTNOTIFY * message. If all is good, we sent the RNDVRESTARTACK message back to * the sender. Then sometime later a message will arrive telling us * to reset and restart the receive request. */ void mca_pml_bfo_recv_request_rndvrestartack(mca_pml_bfo_recv_request_t* recvreq, mca_btl_base_tag_t tag, int status, mca_btl_base_module_t* btl) { mca_btl_base_descriptor_t* des; mca_pml_bfo_restart_hdr_t* restart; ompi_proc_t* proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; mca_bml_base_btl_t* bml_btl; int rc; assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) == RECVREQ_RNDVRESTART_RECVED); assert((recvreq->req_errstate & RECVREQ_RNDVRESTART_ACKED) == 0); assert(0 != bml_endpoint->btl_eager.arr_size); bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); /* Attempt to use a different BTL than the error message was * received on. This may potentially tickle the error sooner if * this side has not seen it yet. 
*/ if (bml_btl->btl == btl) { /* If there is more than one BTL left, then we will get a * different one. If there is only one, we will just get * the same one back again. That is OK. */ bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); } /* allocate descriptor */ mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_restart_hdr_t), MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if( OPAL_UNLIKELY(NULL == des) ) { opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } /* fill out header */ restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; restart->hdr_match.hdr_common.hdr_flags = 0; restart->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK; restart->hdr_match.hdr_ctx = recvreq->req_recv.req_base.req_comm->c_contextid; restart->hdr_match.hdr_src = recvreq->req_recv.req_base.req_comm->c_my_rank; restart->hdr_match.hdr_seq = (uint16_t)recvreq->req_msgseq; restart->hdr_restartseq = recvreq->req_restartseq; restart->hdr_src_req = recvreq->remote_req_send; restart->hdr_dst_req.pval = recvreq; bfo_hdr_hton(restart, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, proc); /* initialize descriptor */ des->des_cbfunc = mca_pml_bfo_recv_restart_completion; des->des_cbdata = (void *)proc; opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTACK: due to PML tag=%d completion, sending to " "sender, PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, " "peer=%d, btl=%p", tag, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, status, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, (void *)bml_btl->btl); rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK); if( OPAL_UNLIKELY( rc < 0 ) ) { opal_output(0, "[%s:%d] Cannot send rndvrestartack message", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } /* Move to the next state so we do not send 
anymore ACKs */ recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_RECVED; recvreq->req_errstate |= RECVREQ_RNDVRESTART_ACKED; } /** * Called after the receipt of a RNDVRESTARTNOTIFY message to a request * that no longer matches. This can happen if the sender detected an * error, but the receiver actually received all the data. Therefore * send a NACK back instead of the ACK so that the sender can complete * its request. This happens very rarely. Note that we need to make * use of the hdr_dst_rank that we received from the notify message. * This is so the sending side make sure the message matches a valid * request on the sending side. */ void mca_pml_bfo_recv_request_rndvrestartnack(mca_btl_base_descriptor_t* olddes, ompi_proc_t* ompi_proc, bool repost) { mca_btl_base_segment_t* segments; mca_pml_bfo_restart_hdr_t* hdr; /* hdr of NOTIFY message */ mca_pml_bfo_restart_hdr_t* nack; /* hdr of NACK message */ mca_btl_base_descriptor_t* des; mca_bml_base_endpoint_t* bml_endpoint; mca_bml_base_btl_t* bml_btl; int rc; if (repost) { /* In the case where we are reposting the NACK, the information * is in the src area, since we are reposting a send. In addition, * we get the ompi_proc from the old descriptor. 
*/ segments = olddes->des_src; ompi_proc = olddes->des_cbdata; } else { segments = olddes->des_dst; } hdr = (mca_pml_bfo_restart_hdr_t*)segments->seg_addr.pval; bml_endpoint = ompi_proc->proc_bml; assert(0 != bml_endpoint->btl_eager.arr_size); bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); /* allocate descriptor */ mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof(mca_pml_bfo_restart_hdr_t), MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if( OPAL_UNLIKELY(NULL == des) ) { opal_output(0, "%s:%d Out of resources, cannot proceed", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } /* fill out header */ nack = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; nack->hdr_match.hdr_common.hdr_flags = 0; nack->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK; nack->hdr_match.hdr_ctx = hdr->hdr_match.hdr_ctx; nack->hdr_match.hdr_src = hdr->hdr_dst_rank; /* Receiver rank */ nack->hdr_match.hdr_seq = hdr->hdr_match.hdr_seq; nack->hdr_restartseq = hdr->hdr_restartseq; nack->hdr_src_req = hdr->hdr_src_req; nack->hdr_dst_req.pval = 0; bfo_hdr_hton(nack, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK, ompi_proc); /* Initialize descriptor. Save away ompi_proc in case we need * to respost this fragmnet. */ des->des_cbfunc = mca_pml_bfo_recv_restart_completion; des->des_cbdata = ompi_proc; opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTNACK: sending to sender, " "PML=%d, RQS=%d, CTX=%d, SRC=%d, peer=%d", nack->hdr_match.hdr_seq, nack->hdr_restartseq, nack->hdr_match.hdr_ctx, nack->hdr_match.hdr_src, ompi_proc->proc_name.vpid); rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK); if( OPAL_UNLIKELY( rc < 0 ) ) { opal_output(0, "[%s:%d] Cannot send rndvrestartnack message", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } } /** * Reset all the receive request fields to match what a request * looks like when it is first started. 
This gets called when
 * the rendezvous/rget message is being restarted.
 */
void mca_pml_bfo_recv_request_reset(mca_pml_bfo_recv_request_t* match)
{
    int idx = 0;

    /* A request that is already PML-complete must never be reset. */
    assert(true != match->req_recv.req_base.req_pml_complete);

    /* Give back every registration that was reserved for this receive.
     * This mirrors what the receive completion code does. */
    while (idx < (int)match->req_rdma_cnt) {
        mca_mpool_base_registration_t* reg = match->req_rdma[idx].btl_reg;

        idx++;
        if (NULL == reg || NULL == reg->mpool) {
            continue;
        }
        reg->mpool->mpool_deregister(reg->mpool, reg);
    }
    match->req_rdma_cnt = 0;

    /* The rest is mostly copied from mca_pml_bfo_recv_req_start.
     * Note 1: req_bytes_expected keeps its original value.  It is set
     * when the convertor is created and needs no adjustment.
     * Note 2: req_bytes_delivered keeps its original value.  It is
     * created when the convertor is created and represents the
     * expected bytes from the user. */
    assert(0 == match->req_events);

    /* Progress counters and offsets start over from zero. */
    match->req_bytes_received = 0;
    match->req_pipeline_depth = 0;
    match->req_send_offset = 0;
    match->req_rdma_offset = 0;
    match->req_rdma_idx = 0;

    /* State flags go back to their initial values. */
    match->req_lock = 0;
    match->req_errstate = 0;
    match->req_ack_sent = false;
    match->req_pending = false;

    /* One more restart has happened on this request. */
    match->req_restartseq += 1;

    /* These really should not need to be set, but this matches some
     * of the initialization within MCA_PML_BASE_RECV_START. */
    match->req_recv.req_base.req_ompi.req_state = OMPI_REQUEST_ACTIVE;
    match->req_recv.req_base.req_ompi.req_complete = false;
    match->req_recv.req_base.req_pml_complete = false;

    /* Rewind the convertor; req_rdma_offset was just zeroed above and
     * serves as the position argument. */
    opal_convertor_set_position(&match->req_recv.req_base.req_convertor,
                                &match->req_rdma_offset);
}

/*
 * Completion callback for RNDVRESTARTACK, RNDVRESTARTNACK and RECVERRNOTIFY.
*/ void mca_pml_bfo_recv_restart_completion( mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, struct mca_btl_base_descriptor_t* des, int status ) { if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) { mca_pml_bfo_common_hdr_t* common = des->des_src->seg_addr.pval; mca_pml_bfo_restart_hdr_t* restart; /* RESTART header */ mca_pml_bfo_recv_request_t* recvreq; switch (common->hdr_type) { case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK: restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval; opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTACK: completion failed: try again " "PML:req=%d,hdr=%d RQS:req=%d,hdr=%d CTX:req=%d,hdr=%d " "src_req=%p, dst_req=%p, peer=%d", recvreq->req_msgseq, restart->hdr_match.hdr_seq, recvreq->req_restartseq, restart->hdr_restartseq, recvreq->req_recv.req_base.req_comm->c_contextid, restart->hdr_match.hdr_ctx, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); /* Adjust the states back to avoid assert errors */ recvreq->req_errstate &= ~RECVREQ_RNDVRESTART_ACKED; recvreq->req_errstate |= RECVREQ_RNDVRESTART_RECVED; mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, status, btl); break; case MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK: opal_output_verbose(30, mca_pml_bfo_output, "RNDVRESTARTNACK: completion failed: try again " "des=%p ", (void *)des); /* Just blast it again. No request associated with it. */ mca_pml_bfo_recv_request_rndvrestartnack(des, NULL, true); break; case MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY: restart = (mca_pml_bfo_restart_hdr_t*)des->des_src->seg_addr.pval; recvreq = (mca_pml_bfo_recv_request_t*) restart->hdr_dst_req.pval; /* With just two BTLs, this should never happen as we are * typically sending the RECVERRNOTIFY message on the * working BTL. But, just in case, if we get an error, * send it again. 
*/ opal_output_verbose(30, mca_pml_bfo_output, "RECVERRNOTIFY: completion failed: try again, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY, status); break; default: opal_output(0, "[%s:%d] Unknown callback error", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } } } /* * Remove a btl for future communication on an endpoint. */ void mca_pml_bfo_map_out_btl(struct mca_btl_base_module_t* btl, ompi_proc_t *errproc, char *btlname) { mca_bml_base_endpoint_t* ep; bool remove = false; int i; ep = (mca_bml_base_endpoint_t*)errproc->proc_bml; /* The bml_del_proc_btl function does not indicate if it * actually removed a btl, so let me check up front. This is * done so that we can only print out messages when a btl is * actually going to be removed. These arrays are small so it * is OK to walk through all of them even though it may be * redundant. 
*/ for( i = 0; i < (int)ep->btl_eager.arr_size; i++ ) { if( ep->btl_eager.bml_btls[i].btl == btl ) { remove = true; } } for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) { if( ep->btl_send.bml_btls[i].btl == btl ) { remove = true; } } for( i = 0; i < (int)ep->btl_rdma.arr_size; i++ ) { if( ep->btl_rdma.bml_btls[i].btl == btl ) { remove = true; } } if (true == remove) { mca_bml.bml_del_proc_btl(errproc, btl); orte_notifier.log(ORTE_NOTIFIER_ERROR, ORTE_ERR_COMM_FAILURE, "BTL %s error: rank=%d mapping out %s " "to rank=%d on node=%s", btl->btl_component->btl_version.mca_component_name, ORTE_PROC_MY_NAME->vpid, btlname, errproc->proc_name.vpid, errproc->proc_hostname); opal_output_verbose(10, mca_pml_bfo_output, "BTL %s error: rank=%d mapping out %s " "to rank=%d on node=%s \n", btl->btl_component->btl_version.mca_component_name, ORTE_PROC_MY_NAME->vpid, btlname, errproc->proc_name.vpid, errproc->proc_hostname); /* Need to search for any pending packets associated * with this endpoint and remove them. We may also * have to restarts depending on the state of the * requests. */ mca_pml_bfo_error_pending_packets(btl, ep); if ((ep->btl_eager.arr_size == 0) && (ep->btl_send.arr_size == 0) && (ep->btl_rdma.arr_size == 0)) { opal_output(0, "%s:%d: No more interfaces, aborting", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } } } void mca_pml_bfo_failover_error_handler(struct mca_btl_base_module_t* btl, int32_t flags, ompi_proc_t *errproc, char *btlname) { ompi_proc_t** procs; size_t p, num_procs; /* If we are in here, we know that the we were called * with the flags == MCA_BTL_ERROR_FLAGS_NONFATAL so no * need to check it in here. */ assert(flags & MCA_BTL_ERROR_FLAGS_NONFATAL); procs = ompi_proc_all(&num_procs); if(NULL == procs) { opal_output(0, "%s:%d: Out of memory, giving up.", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } if (NULL == btlname) { btlname = "unknown"; } /* If the process to map out is not specified then map out the * entire BTL. 
Otherwise, only map out the BTL for the specific * remote process. */ if (NULL == errproc) { for( p = 0; p < num_procs; p++ ) { mca_pml_bfo_map_out_btl(btl, procs[p], btlname); } } else { mca_pml_bfo_map_out_btl(btl, errproc, btlname); } free(procs); } /** * This function is called since when we are mapping out a BML. This * will walk through the four PML lists and dispatch with the * fragments/requests. There are four different lists and each one is * handled slighty differently. In all cases, we first see if the * message is associated with the endpoint that is being mapped out. * If not, then just leave it alone and put it back on the list. If * it is associated with the endpoint, then a each list handles it * slighlty differently. Also, in some cases, we actually adjust the * pointers to the BMLs in the messages as they may have changed when * the BML is mapped out. That is because this is called after we * have mapped out the offending BML and adjusted the array of * available BMLs. */ static void mca_pml_bfo_error_pending_packets(mca_btl_base_module_t* btl, mca_bml_base_endpoint_t* ep) { int32_t i, s; /* The pckt_pending list contains both ACK and FIN messages. * ACKs can be sent over any BTL associated with the endpoint. * Therefore, the bml_btl entry for ACKS is NULL and they do * not need to be adjusted. It is also worth noting that * the ACK will be the only outstanding message associated * with a request so we can just let nature takes it course. * * FIN messages do have a BML associated with them, but they * can also be sent over any BTL. Therefore, adjust the bml * pointer in the pckt to ensure it points at a valid BML. 
*/ s = (int32_t)opal_list_get_size(&mca_pml_bfo.pckt_pending); for(i = 0; i < s; i++) { mca_pml_bfo_pckt_pending_t *pckt; opal_output_verbose(0, mca_pml_bfo_output, "INFO: pckt_pending list has %d entries", s); #if 1 /* TODO: Error out until code is tested */ opal_output_verbose(0, mca_pml_bfo_output, "%s:%d: Support not implemented, aborting", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); #endif OPAL_THREAD_LOCK(&mca_pml_bfo.lock); pckt = (mca_pml_bfo_pckt_pending_t*) opal_list_remove_first(&mca_pml_bfo.pckt_pending); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); /* My guess is that this can happen in the threaded * case where the other thread removed some packets * after we determined the size of the list. */ if(NULL == pckt) break; /* If there is no bml stored on the packet, then just * put it back on the list as there is nothing to adjust. * This appears to be true with ACK packets. */ if (NULL == pckt->bml_btl) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.pckt_pending, (opal_list_item_t*)pckt); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); continue; } /* Now see if this endpoint matches the one we are mapping * out. If so, adjust the bml entry so to ensure it is * not pointing at a stale bml. We do not really care * which BML it is pointing at as long as it is valid. * In either case, then put entry back on the list. */ if (pckt->proc->proc_bml == ep) { opal_output_verbose(15, mca_pml_bfo_output, "INFO: Found matching pckt on pckt_pending list, adjusting bml"); pckt->bml_btl = mca_bml_base_btl_array_get_next(&ep->btl_eager); } OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.pckt_pending, (opal_list_item_t*)pckt); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); } /* This next list holds rdma fragments. We need to walk through * the list and see if any are associated with the endpoint * we are mapping out. If not, then just put back on the * list. If they are, then we need to error them out. 
One issue * is that we need to deal with the case where there may be more * then one pending rdma fragment for a request. */ s = (int32_t)opal_list_get_size(&mca_pml_bfo.rdma_pending); for(i = 0; i < s; i++) { mca_pml_bfo_rdma_frag_t* frag; mca_pml_bfo_send_request_t* sendreq; mca_pml_bfo_recv_request_t* recvreq; opal_output_verbose(0, mca_pml_bfo_output, "INFO: rdma_pending list has %d entries", s); #if 1 /* TODO: Error out until code is tested */ opal_output_verbose(0, mca_pml_bfo_output, "%s:%d: Support not implemented, aborting", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); #endif OPAL_THREAD_LOCK(&mca_pml_bfo.lock); frag = (mca_pml_bfo_rdma_frag_t*) opal_list_remove_first(&mca_pml_bfo.rdma_pending); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); /* My guess is that this can happen in the threaded * case where the other thread removed some packets * after we determined the size of the list. */ if(NULL == frag) break; /* Check to see if it matches our endpoint. If it does, * then check if it matches the BTL that is being mapped * out. If it does not, then just readjust the BML pointer. * If it does, then we need to do something with it. */ if (frag->rdma_ep != ep) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); continue; } /* If we are here, then we know we are working on the same * endpoint. Now check the BTL. */ if (frag->rdma_btl != btl) { opal_output_verbose(15, mca_pml_bfo_output, "INFO: Found matching frag on rdma_pending list, adjusting bml"); /* The BTL this RDMA is associated with is not the * one that is getting mapped out, so just adjust the * BML pointer and put back on the list. */ frag->rdma_bml = mca_bml_base_btl_array_find(&ep->btl_rdma, frag->rdma_btl); OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.rdma_pending, (opal_list_item_t*)frag); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); continue; } /* Now we call the restart routine. 
This is just like if we got * a completion event after calling an RDMA write. This will * take care of figuring out if we need to restart the request * or wait for any outstanding events to complete. */ if(frag->rdma_state == MCA_PML_BFO_RDMA_PUT) { opal_output_verbose(15, mca_pml_bfo_output, "INFO: Found matching PUT frag on rdma_pending list, restarting"); sendreq = frag->rdma_req; mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, MCA_PML_BFO_HDR_TYPE_PUT, 2, btl); MCA_PML_BFO_RDMA_FRAG_RETURN(frag); } else { opal_output_verbose(15, mca_pml_bfo_output, "INFO: Found matching RGET frag on rdma_pending list, sending reqerror"); /* This is just like what we do on an rget completion event */ recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, 2); /* See if the request has received a RNDVRESTARTNOTIFY */ if( OPAL_UNLIKELY(recvreq->req_errstate)) { if (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED) { mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, 2, btl); } } MCA_PML_BFO_RDMA_FRAG_RETURN(frag); } } s = opal_list_get_size(&mca_pml_bfo.send_pending); /* Look for pending events on our endpoint */ for(i = 0; i < s; i++) { mca_pml_bfo_send_request_t* sendreq; ompi_proc_t* proc; mca_bml_base_endpoint_t* bml_endpoint; opal_output_verbose(0, mca_pml_bfo_output, "INFO: send_pending list has %d entries", s); #if 1 /* TODO: Error out until code is tested */ opal_output_verbose(0, mca_pml_bfo_output, "%s:%d: Support not implemented, aborting", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); #endif OPAL_THREAD_LOCK(&mca_pml_bfo.lock); sendreq = (mca_pml_bfo_send_request_t*) opal_list_remove_first(&mca_pml_bfo.send_pending); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); /* My guess is that this can happen in the threaded * case where the other thread removed some packets * after we determined the size of the list. 
*/ if(NULL == sendreq) break; proc = (ompi_proc_t*)sendreq->req_send.req_base.req_proc; bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; /* Check to see if it matches our endpoint. If it does not, * then just put it back on the list as there is nothing * we need to do with it. */ if (bml_endpoint != ep) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.send_pending, (opal_list_item_t*)sendreq); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); continue; } switch(sendreq->req_pending) { case MCA_PML_BFO_SEND_PENDING_SCHEDULE: /* If this send request is using the endpoint that received * the error, then let us error it out. In the case * where there is only one fragment left to be scheduled * and it would have gone over the good BTL, this is * not necessary. But, we will use simplicity here * and assume that some of the fragments are still * scheduled to go over the broken BTL. */ sendreq->req_error++; mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, MCA_PML_BFO_HDR_TYPE_FRAG, 2, btl); break; case MCA_PML_BFO_SEND_PENDING_START: /* If the request has not even started, then just put it back * on the list. Nothing else to do with it. 
*/ OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.send_pending, (opal_list_item_t*)sendreq); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); break; default: opal_output(0, "[%s:%d] wrong send request type\n", __FILE__, __LINE__); break; } } s = (int)opal_list_get_size(&mca_pml_bfo.recv_pending); for(i = 0; i < s; i++) { mca_pml_bfo_recv_request_t* recvreq; ompi_proc_t* proc; mca_bml_base_endpoint_t* bml_endpoint; opal_output_verbose(0, mca_pml_bfo_output, "INFO: recv_pending list has %d entries", s); #if 1 /* TODO: Error out until code is tested */ opal_output_verbose(0, mca_pml_bfo_output, "%s:%d: Support not implemented, aborting", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); #endif OPAL_THREAD_LOCK(&mca_pml_bfo.lock); recvreq = (mca_pml_bfo_recv_request_t*) opal_list_remove_first(&mca_pml_bfo.recv_pending); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); /* My guess is that this can happen in the threaded * case where the other thread removed some packets * after we determined the size of the list. */ if(NULL == recvreq) break; proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; if (bml_endpoint != ep) { OPAL_THREAD_LOCK(&mca_pml_bfo.lock); opal_list_append(&mca_pml_bfo.recv_pending, (opal_list_item_t*)recvreq); OPAL_THREAD_UNLOCK(&mca_pml_bfo.lock); continue; } mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, 2); } } /** * Call each time we get a completion event on ACK or PUT message. * These types of messages are receive control type messages. This * function is only called if the underlying BTL supports failover. * Otherwise, there is no need for this check. 
*/ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl, struct mca_btl_base_descriptor_t* des, int status) { mca_pml_bfo_common_hdr_t * common = des->des_src->seg_addr.pval; mca_pml_bfo_ack_hdr_t* ack; /* ACK header */ mca_pml_bfo_rdma_hdr_t* hdr; /* PUT header */ struct mca_btl_base_descriptor_t* rdma_des; mca_pml_bfo_recv_request_t* recvreq; if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) { switch (common->hdr_type) { case MCA_PML_BFO_HDR_TYPE_ACK: ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval; recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval; /* Record the error. Send RECVERRNOTIFY if necessary. */ if (recvreq->req_errstate) { opal_output_verbose(30, mca_pml_bfo_output, "ACK: completion failed, error already seen, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); } else { opal_output_verbose(30, mca_pml_bfo_output, "ACK: completion failed, sending RECVERRNOTIFY to sender, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_ACK, status); } break; case MCA_PML_BFO_HDR_TYPE_PUT: hdr = (mca_pml_bfo_rdma_hdr_t*)des->des_src->seg_addr.pval; rdma_des = hdr->hdr_des.pval; recvreq = des->des_cbdata; if ((NULL != rdma_des->des_cbdata) && (recvreq == rdma_des->des_cbdata)) { /* We now record the error, send the RECVERRNOTIFY if * necessary, and free the descriptor. Prior to this, * we want to ensure that we have not reached the case * where the PUT message actually made it over and we * have already received a FIN back. We first check to * see if the RDMA descriptor cbdata is pointing to * NULL. 
If it is, this means that the PUT message must * have made it over and a corresponding FIN already * made it back and freed the RDMA descriptor. Second, * if it is non-null, we make sure that it is pointing * to the same request as the PUT descriptor is. If * it is not, again we assume that the FIN came back * and freed it. And we can count on the fact that the * recvreq has not been freed or reused as it is held * until this very completion event occurs. */ if (recvreq->req_errstate) { opal_output_verbose(30, mca_pml_bfo_output, "PUT: completion failed, error already seen, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); } else { opal_output_verbose(30, mca_pml_bfo_output, "PUT: completion failed, sending RECVERRNOTIFY to sender, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, status); } #if 0 /* TODO: Add descriptor to receive request so it can * be freed only when receive request is freed and * only if needed. 
*/ btl->btl_free(btl, rdma_des); #endif } break; default: orte_errmgr.abort(-1, NULL); } } switch (common->hdr_type) { case MCA_PML_BFO_HDR_TYPE_ACK: ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval; recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval; recvreq->req_events--; assert(recvreq->req_events >= 0); if(OPAL_UNLIKELY (recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) { opal_output_verbose(30, mca_pml_bfo_output, "ACK: completion: recvreq in error, outstanding events=%d " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, status, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); if (0 == recvreq->req_events) { mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_ACK, status, btl); } return; } recv_request_pml_complete_check(recvreq); break; case MCA_PML_BFO_HDR_TYPE_PUT: recvreq = des->des_cbdata; recvreq->req_events--; assert(recvreq->req_events >= 0); if(OPAL_UNLIKELY(recvreq->req_errstate & RECVREQ_RNDVRESTART_RECVED)) { opal_output_verbose(30, mca_pml_bfo_output, "PUT: completion: recvreq in error, outstanding events=%d " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", recvreq->req_events, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, status, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); if (0 == recvreq->req_events) { mca_pml_bfo_recv_request_rndvrestartack(recvreq, MCA_PML_BFO_HDR_TYPE_PUT, status, btl); } return; } recv_request_pml_complete_check(recvreq); break; } } /** * Register four functions to handle extra PML message types that * are utilized when a failover occurs. */ int mca_pml_bfo_register_callbacks(void) { int rc; /* The following four functions are utilized when failover * support for openib is enabled. 
*/ rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY, mca_pml_bfo_recv_frag_callback_rndvrestartnotify, NULL ); if(OMPI_SUCCESS != rc) return rc; rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK, mca_pml_bfo_recv_frag_callback_rndvrestartack, NULL ); if(OMPI_SUCCESS != rc) return rc; rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK, mca_pml_bfo_recv_frag_callback_rndvrestartnack, NULL ); if(OMPI_SUCCESS != rc) return rc; rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY, mca_pml_bfo_recv_frag_callback_recverrnotify, NULL ); if(OMPI_SUCCESS != rc) return rc; return rc; } /** * Update a few fields when we are restarting either a RNDV or * RGET type message. */ void mca_pml_bfo_update_rndv_fields(mca_pml_bfo_hdr_t* hdr, mca_pml_bfo_send_request_t* sendreq, char *type) { hdr->hdr_common.hdr_flags |= MCA_PML_BFO_HDR_FLAGS_RESTART; hdr->hdr_rndv.hdr_dst_req = sendreq->req_recv; hdr->hdr_rndv.hdr_restartseq = sendreq->req_restartseq; opal_output_verbose(30, mca_pml_bfo_output, "%s: restarting: PML=%d, RQS=%d, CTX=%d, SRC=%d, " "src_req=%p, dst_req=%p, peer=%d", type, (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, sendreq->req_send.req_base.req_comm->c_my_rank, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); } /** * The following set of functions are all called when it is determined * that the cached bml_btl->btl does not match the btl handed back * by the callback function. This means that the bml_btl array has * been shuffled and the bml_btl matching the btl has to be found * back. If it cannot be found, then just find a different one to * use. 
*/ void mca_pml_bfo_update_eager_bml_btl_recv_ctl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl, struct mca_btl_base_descriptor_t* des) { if ((*bml_btl)->btl != btl) { mca_pml_bfo_common_hdr_t * common = des->des_src->seg_addr.pval; mca_pml_bfo_ack_hdr_t* ack; /* ACK header */ mca_pml_bfo_recv_request_t* recvreq; char *type; switch (common->hdr_type) { case MCA_PML_BFO_HDR_TYPE_ACK: ack = (mca_pml_bfo_ack_hdr_t*)des->des_src->seg_addr.pval; recvreq = (mca_pml_bfo_recv_request_t*) ack->hdr_dst_req.pval; type = "ACK"; break; case MCA_PML_BFO_HDR_TYPE_PUT: recvreq = des->des_cbdata; type = "PUT"; break; } mca_pml_bfo_find_recvreq_eager_bml_btl(bml_btl, btl, recvreq, type); } } void mca_pml_bfo_find_sendreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl, mca_pml_bfo_send_request_t* sendreq, char* type) { if ((*bml_btl)->btl != btl) { opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML does not match BTL, find it back, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", type, (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); *bml_btl = mca_bml_base_btl_array_find(&sendreq->req_endpoint->btl_eager, btl); if (NULL == *bml_btl) { opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML is gone, find another one, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", type, (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); *bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->btl_eager); } } } void mca_pml_bfo_find_sendreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl, mca_pml_bfo_send_request_t* sendreq, char* type) { if ((*bml_btl)->btl != btl) { opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML does not match BTL, find it back, " "PML=%d, RQS=%d, src_req=%p, 
dst_req=%p, peer=%d", type, (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); *bml_btl = mca_bml_base_btl_array_find(&sendreq->req_endpoint->btl_rdma, btl); if (NULL == *bml_btl) { opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML is gone, find another one, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", type, (uint16_t)sendreq->req_send.req_base.req_sequence, sendreq->req_restartseq, (void *)sendreq, sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); *bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->btl_rdma); } } } void mca_pml_bfo_find_recvreq_eager_bml_btl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl, mca_pml_bfo_recv_request_t* recvreq, char* type) { if ((*bml_btl)->btl != btl) { ompi_proc_t *proc = (ompi_proc_t*)recvreq->req_recv.req_base.req_proc; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML does not match BTL, find it back, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", type, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, btl); if (NULL == *bml_btl) { opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML is gone, find another one, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", type, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_eager); } } } void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl, mca_btl_base_module_t* btl, mca_pml_bfo_recv_request_t* recvreq, char* type) { if ((*bml_btl)->btl != btl) { ompi_proc_t *proc = 
(ompi_proc_t*)recvreq->req_recv.req_base.req_proc; mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML does not match BTL, find it back, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", type, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); if (NULL == *bml_btl) { opal_output_verbose(25, mca_pml_bfo_output, "%s completion: BML is gone, find another one, " "PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", type, recvreq->req_msgseq, recvreq->req_restartseq, recvreq->remote_req_send.pval, (void *)recvreq, recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma); } } }