diff --git a/ompi/mca/pml/bfo/pml_bfo.c b/ompi/mca/pml/bfo/pml_bfo.c index 1c9911bfa4..9fd4ef5843 100644 --- a/ompi/mca/pml/bfo/pml_bfo.c +++ b/ompi/mca/pml/bfo/pml_bfo.c @@ -48,9 +48,9 @@ #include "pml_bfo_sendreq.h" #include "pml_bfo_recvreq.h" #include "pml_bfo_rdmafrag.h" -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO #include "pml_bfo_failover.h" -/* BFO FAILOVER CODE - end */ +#endif mca_pml_bfo_t mca_pml_bfo = { { @@ -409,13 +409,12 @@ int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs) NULL ); if(OMPI_SUCCESS != rc) goto cleanup_and_return; - -/* BFO FAILOVER CODE - begin */ + +#ifdef PML_BFO rc = mca_pml_bfo_register_callbacks(); if(OMPI_SUCCESS != rc) goto cleanup_and_return; -/* BFO FAILOVER CODE - end */ - +#endif /* register error handlers */ rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler); if(OMPI_SUCCESS != rc) @@ -472,14 +471,13 @@ static void mca_pml_bfo_fin_completion( mca_btl_base_module_t* btl, mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { mca_pml_bfo_repost_fin(des); return; } MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des); -/* BFO FAILOVER CODE - end */ - +#endif /* check for pending requests */ MCA_PML_BFO_PROGRESS_PENDING(bml_btl); } @@ -494,12 +492,14 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, ompi_ptr_t hdr_des, uint8_t order, +#ifdef PML_BFO uint32_t status, -/* BFO FAILOVER CODE - begin */ uint16_t seq, uint8_t restartseq, uint16_t ctx, uint32_t src) -/* BFO FAILOVER CODE - end */ +#else + uint32_t status ) +#endif { mca_btl_base_descriptor_t* fin; mca_pml_bfo_fin_hdr_t* hdr; @@ -521,13 +521,13 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc, hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; hdr->hdr_des = hdr_des; hdr->hdr_fail = status; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO fin->des_cbdata = proc; hdr->hdr_match.hdr_seq = seq; hdr->hdr_match.hdr_ctx = ctx; hdr->hdr_match.hdr_src = src; hdr->hdr_match.hdr_common.hdr_flags = restartseq; /* use unused hdr_flags field */ -/* BFO FAILOVER CODE - end */ +#endif bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc); @@ -594,11 +594,15 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl) rc = mca_pml_bfo_send_fin(pckt->proc, send_dst, pckt->hdr.hdr_fin.hdr_des, pckt->order, +#ifdef PML_BFO pckt->hdr.hdr_fin.hdr_fail, pckt->hdr.hdr_fin.hdr_match.hdr_seq, pckt->hdr.hdr_fin.hdr_match.hdr_common.hdr_flags, pckt->hdr.hdr_fin.hdr_match.hdr_ctx, pckt->hdr.hdr_fin.hdr_match.hdr_src); +#else + pckt->hdr.hdr_fin.hdr_fail); +#endif if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { return; } @@ -640,14 +644,13 @@ void mca_pml_bfo_process_pending_rdma(void) void mca_pml_bfo_error_handler( struct mca_btl_base_module_t* btl, int32_t flags, ompi_proc_t* errproc, char* btlinfo ) { -/* BFO FAILOVER CODE - begin */ - /* If we get a non-fatal error, try to failover */ +#ifdef PML_BFO if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) { mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlinfo); -/* BFO FAILOVER CODE - end */ - } else { - orte_errmgr.abort(-1, NULL); + return; } +#endif + orte_errmgr.abort(-1, NULL); } #if OPAL_ENABLE_FT_CR == 0 diff --git a/ompi/mca/pml/bfo/pml_bfo.h b/ompi/mca/pml/bfo/pml_bfo.h index 0011c60dc2..3592c8c4c3 100644 --- a/ompi/mca/pml/bfo/pml_bfo.h +++ b/ompi/mca/pml/bfo/pml_bfo.h @@ -23,6 +23,7 @@ #ifndef MCA_PML_BFO_H #define MCA_PML_BFO_H +#define PML_BFO 1 #include "ompi_config.h" #include "ompi/class/ompi_free_list.h" #include "ompi/request/request.h" @@ -225,8 +226,12 @@ do { \ int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, +#ifdef PML_BFO ompi_ptr_t hdr_des, uint8_t order, uint32_t status, uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src); +#else + ompi_ptr_t hdr_des, uint8_t order, uint32_t status); +#endif /* This function tries to resend FIN/ACK packets from pckt_pending queue. * Packets are added to the queue when sending of FIN or ACK is failed due to diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.c b/ompi/mca/pml/bfo/pml_bfo_failover.c index 59f5115e38..aa500fd06c 100644 --- a/ompi/mca/pml/bfo/pml_bfo_failover.c +++ b/ompi/mca/pml/bfo/pml_bfo_failover.c @@ -2103,3 +2103,102 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl, } } } + +/** + * The completion event for the RNDV message has returned with an + * error. We know that the send request we are looking at is valid + * because it cannot be completed until the sendreq->req_state value + * reaches 0. And for the sendreq->req_state to reach 0, the + * completion event on the RNDV message must occur. So, we do not + * bother checking whether the send request is valid, because we know + * it is, but we put a few asserts in for good measure. We then check + * a few fields in the request to decide what to do. If the + * sendreq->req_error is set, that means that something has happend + * already to the request and we do not want to restart it. + * Presumably, we may have received a RECVERRNOTIFY message from the + * receiver. We also check the sendreq->req_acked field to see if it + * has been acked. If it has, then again we do not restart everything + * because obviously the RNDV message has made it to the other side. + */ +bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des, + mca_pml_bfo_send_request_t* sendreq) +{ + assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_ctx == + (sendreq)->req_send.req_base.req_comm->c_contextid); + assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_src == + (sendreq)->req_send.req_base.req_comm->c_my_rank); + assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_seq == + (uint16_t)(sendreq)->req_send.req_base.req_sequence); + if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) { + (sendreq)->req_events--; + /* Assume RNDV did not make it, so restart from the beginning. */ + mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV); + return true; + } + return false; +} + +/** + * Check to see if an error has occurred on this send request. If it has + * and there are no outstanding events, then we can start the restart dance. + */ +void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq, + int status, + mca_btl_base_module_t* btl, + int type, + char *description) +{ + opal_output_verbose(30, mca_pml_bfo_output, + "%s: completion: sendreq has error, outstanding events=%d, " + "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", + description, + sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, (void *)sendreq, + sendreq->req_recv.pval, + status, sendreq->req_send.req_base.req_peer); + if (0 == sendreq->req_events) { + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, + type, status, btl); + } +} + +/* If we get an error on the RGET message, then first make sure that + * header matches the send request that we are pointing to. This is + * necessary, because even though the sending side got an error, the + * RGET may have made it to the receiving side and the message transfer + * may have completed. This would then mean the send request has been + * completed and perhaps in use by another communication. So there is + * no need to restart this request. Therefore, ensure that we are + * looking at the same request that the header thinks we are looking + * at. If not, then there is nothing else to be done. */ +void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des) +{ + mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; + mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval; + switch (hdr->hdr_common.hdr_type) { + case MCA_PML_BFO_HDR_TYPE_RGET: + if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || + (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) || + (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { + opal_output_verbose(30, mca_pml_bfo_output, + "RGET: completion event: dropping because no valid request " + "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " + "RQS:exp=%d,act=%d, dst_req=%p", + (uint16_t)sendreq->req_send.req_base.req_sequence, + hdr->hdr_match.hdr_seq, + sendreq->req_send.req_base.req_comm->c_contextid, + hdr->hdr_match.hdr_ctx, + sendreq->req_send.req_base.req_comm->c_my_rank, + hdr->hdr_match.hdr_src, + sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq, + (void *)sendreq); + return; + } + mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET); + return; + default: + opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)", + __FILE__, __LINE__, hdr->hdr_common.hdr_type); + orte_errmgr.abort(-1, NULL); + } +} diff --git a/ompi/mca/pml/bfo/pml_bfo_failover.h b/ompi/mca/pml/bfo/pml_bfo_failover.h index 084b69e428..364dc75a94 100644 --- a/ompi/mca/pml/bfo/pml_bfo_failover.h +++ b/ompi/mca/pml/bfo/pml_bfo_failover.h @@ -105,6 +105,16 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl, mca_pml_bfo_recv_request_t* recvreq, char* type); +bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des, + mca_pml_bfo_send_request_t* sendreq); +void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des); + + +void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq, + int status, + mca_btl_base_module_t* btl, + int type, + char *description); /** * Four new callbacks for the four new message types. */ @@ -243,57 +253,33 @@ extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t * Macros for pml_bfo_sendreq.c file. */ -/* The completion event for the RNDV message has returned with an - * error. We know that the send request we are looking at is valid - * because it cannot be completed until the sendreq->req_state value - * reaches 0. And for the sendreq->req_state to reach 0, the - * completion event on the RNDV message must occur. So, we do not - * bother checking whether the send request is valid, because we know - * it is, but we put a few asserts in for good measure. We then check - * a few fields in the request to decide what to do. If the - * sendreq->req_error is set, that means that something has happend - * already to the request and we do not want to restart it. - * Presumably, we may have received a RECVERRNOTIFY message from the - * receiver. We also check the sendreq->req_acked field to see if it - * has been acked. If it has, then again we do not restart everything - * because obviously the RNDV message has made it to the other side. */ -#define MCA_PML_BFO_ERROR_ON_RNDV_COMPLETION(sendreq, des) \ -do { \ - assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_ctx == \ - (sendreq)->req_send.req_base.req_comm->c_contextid); \ - assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_src == \ - (sendreq)->req_send.req_base.req_comm->c_my_rank); \ - assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_seq == \ - (uint16_t)(sendreq)->req_send.req_base.req_sequence); \ - if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) { \ - (sendreq)->req_events--; \ - /* Assume RNDV did not make it, so restart from the beginning. */ \ - mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV); \ - return; \ - } \ -} while (0) - /* Now check the error state. This request can be in error if the * RNDV message made it over, but the receiver got an error trying to * send the ACK back and therefore sent a RECVERRNOTIFY message. In * that case, we want to start the restart dance as the receiver has * matched this message already. Only restart if there are no * outstanding events on send request. */ -#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RNDV_COMPLETION(sendreq, status, btl) \ - if ((sendreq)->req_error) { \ - opal_output_verbose(30, mca_pml_bfo_output, \ - "RNDV: completion: sendreq has error, outstanding events=%d, " \ - "PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", \ - (sendreq)->req_events, \ - (uint16_t)(sendreq)->req_send.req_base.req_sequence, \ - (sendreq)->req_restartseq, (unsigned long)(sendreq), \ - (unsigned long)(sendreq)->req_recv.pval, \ - status, (sendreq)->req_send.req_base.req_peer); \ - if (0 == (sendreq)->req_events) { \ - mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \ - MCA_PML_BFO_HDR_TYPE_RNDV, \ - status, btl); \ - } \ +#define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \ + if( OPAL_UNLIKELY ((sendreq)->req_error)) { \ + mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \ + btl, type, description); \ + return; \ + } + +/** + * This macro is called within the frag completion function in two + * places. It is called to see if any errors occur prior to the + * completion event on the frag. It is then called a second time + * after the scheduling routine is called as the scheduling routine + * may have detected that a BTL that was cached on the request had + * been removed and therefore marked the request in error. In that + * case, the scheduling of fragments can no longer proceed properly, + * and if there are no outstanding events, iniated the restart dance. + */ +#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \ + if( OPAL_UNLIKELY((sendreq)->req_error)) { \ + mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \ + btl, type, description); \ return; \ } @@ -302,7 +288,7 @@ do { * field is not checked here. That is because that is the value * returned in the FIN hdr.hdr_fail field and may be used for other * things. */ -#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RGET_COMPLETION(sendreq, btl, des) \ +#define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des) \ if( OPAL_UNLIKELY(sendreq->req_error)) { \ opal_output_verbose(30, mca_pml_bfo_output, \ "FIN: received on broken request, skipping, " \ @@ -314,66 +300,15 @@ do { return; \ } -/* If we get an error on the RGET message, then first make sure that - * header matches the send request that we are pointing to. This is - * necessary, because even though the sending side got an error, the - * RGET may have made it to the receiving side and the message transfer - * may have completed. This would then mean the send request has been - * completed and perhaps in use by another communication. So there is - * no need to restart this request. Therefore, ensure that we are - * looking at the same request that the header thinks we are looking - * at. If not, then there is nothing else to be done. */ -#define MCA_PML_BFO_ERROR_ON_SEND_CTL_COMPLETION(sendreq, des) \ -do { \ - mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval; \ - mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; \ - switch (hdr->hdr_common.hdr_type) { \ - case MCA_PML_BFO_HDR_TYPE_RGET: \ - if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || \ - (hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) || \ - (hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { \ - opal_output_verbose(30, mca_pml_bfo_output, \ - "RGET: completion event: dropping because no valid request " \ - "PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " \ - "RQS:exp=%d,act=%d, dst_req=%p", \ - (uint16_t)sendreq->req_send.req_base.req_sequence, \ - hdr->hdr_match.hdr_seq, \ - sendreq->req_send.req_base.req_comm->c_contextid, \ - hdr->hdr_match.hdr_ctx, \ - sendreq->req_send.req_base.req_comm->c_my_rank, \ - hdr->hdr_match.hdr_src, \ - sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq, \ - (void *)sendreq); \ - return; \ - } \ - mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET); \ - return; \ - default: \ - opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)", \ - __FILE__, __LINE__, hdr->hdr_common.hdr_type); \ - orte_errmgr.abort(-1, NULL); \ - } \ -} while (0) /* Check if there has been an error on the send request when we get * a completion event on the RDMA write. */ -#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_PUT_COMPLETION(sendreq, status, btl) \ - if ( OPAL_UNLIKELY(sendreq->req_error)) { \ - opal_output_verbose(30, mca_pml_bfo_output, \ - "RDMA write: completion: sendreq has error, outstanding events=%d," \ - " PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", \ - sendreq->req_events, \ - (uint16_t)sendreq->req_send.req_base.req_sequence, \ - sendreq->req_restartseq, (void *)sendreq, \ - sendreq->req_recv.pval, \ - status, sendreq->req_send.req_base.req_peer); \ - if (0 == sendreq->req_events) { \ - mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \ - MCA_PML_BFO_HDR_TYPE_PUT, \ - status, btl); \ - } \ - MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \ - return; \ +#define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl) \ + if ( OPAL_UNLIKELY(sendreq->req_error)) { \ + mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \ + MCA_PML_BFO_HDR_TYPE_PUT, "RDMA write"); \ + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \ + return; \ } #define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type) \ @@ -416,6 +351,28 @@ do { mca_pml_bfo_update_eager_bml_btl_recv_ctl(&bml_btl, btl, des); \ } +#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl) \ + if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { \ + opal_output_verbose(30, mca_pml_bfo_output, \ + "PUT received: no matching BTL to RDMA write to, oustanding " \ + "events=%d, PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \ + sendreq->req_events, \ + (uint16_t)sendreq->req_send.req_base.req_sequence, \ + sendreq->req_restartseq, (void *)sendreq, \ + sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); \ + MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \ + sendreq->req_error++; \ + if (0 == sendreq->req_events) { \ + mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \ + MCA_PML_BFO_HDR_TYPE_PUT, \ + OMPI_ERROR, btl); \ + } \ + return; \ + } + + + + END_C_DECLS #endif diff --git a/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h index 1e7923bf60..e1aaaa3003 100644 --- a/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h +++ b/ompi/mca/pml/bfo/pml_bfo_rdmafrag.h @@ -36,7 +36,9 @@ typedef enum { struct mca_pml_bfo_rdma_frag_t { ompi_free_list_item_t super; mca_bml_base_btl_t* rdma_bml; +#ifdef PML_BFO mca_btl_base_module_t* rdma_btl; +#endif mca_pml_bfo_hdr_t rdma_hdr; mca_pml_bfo_rdma_state_t rdma_state; size_t rdma_length; diff --git a/ompi/mca/pml/bfo/pml_bfo_recvfrag.c b/ompi/mca/pml/bfo/pml_bfo_recvfrag.c index 94e7b8be64..071064682d 100644 --- a/ompi/mca/pml/bfo/pml_bfo_recvfrag.c +++ b/ompi/mca/pml/bfo/pml_bfo_recvfrag.c @@ -41,9 +41,9 @@ #include "pml_bfo_recvreq.h" #include "pml_bfo_sendreq.h" #include "pml_bfo_hdr.h" -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO #include "pml_bfo_failover.h" -/* BFO FAILOVER CODE - end */ +#endif OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t, ompi_free_list_item_t, @@ -242,13 +242,13 @@ void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl, slow_path: OPAL_THREAD_UNLOCK(&comm->matching_lock); -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO /* Check for duplicate messages. If message is duplicate, then just * return as that essentially drops the message. */ if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { return; } -/* BFO FAILOVER CODE - end */ +#endif mca_pml_bfo_recv_frag_match(btl, hdr, segments, num_segments, MCA_PML_BFO_HDR_TYPE_MATCH); } @@ -306,9 +306,9 @@ void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl, bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK); sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval; sendreq->req_recv = hdr->hdr_ack.hdr_dst_req; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq) -/* BFO FAILOVER CODE - end */ +#endif /* if the request should be delivered entirely by copy in/out * then throttle sends */ @@ -352,9 +352,9 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl, } bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG); recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq) -/* BFO FAILOVER CODE - end */ +#endif mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt); return; @@ -375,9 +375,9 @@ void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl, bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT); sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq) -/* BFO FAILOVER CODE - end */ +#endif mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma); return; @@ -398,11 +398,11 @@ void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl, bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN); rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) { return; } -/* BFO FAILOVER CODE - end */ +#endif rdma->des_cbfunc(btl, NULL, rdma, hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS); @@ -626,7 +626,7 @@ static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl, * the fragment. */ OPAL_THREAD_LOCK(&comm->matching_lock); -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO /* In case of network failover, we may get a message telling us to * restart. In that case, we already have a pointer to the receive * request in the header itself. */ @@ -635,8 +635,8 @@ static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl, if (NULL == match) { return OMPI_SUCCESS; } -/* BFO FAILOVER CODE - end */ } else { +#endif /* get sequence number of next message that can be processed */ next_msg_seq_expected = (uint16_t)proc->expected_sequence; @@ -673,8 +673,9 @@ out_of_order_match: /* release matching lock before processing fragment */ OPAL_THREAD_UNLOCK(&comm->matching_lock); +#ifdef PML_BFO } - +#endif if(OPAL_LIKELY(match)) { switch(type) { case MCA_PML_BFO_HDR_TYPE_MATCH: @@ -716,14 +717,13 @@ wrong_seq: * This message comes after the next expected, so it * is ahead of sequence. Save it for later. */ -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO /* Check for duplicate messages. If message is duplicate, then just * return as that essentially drops the message. */ if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { return OMPI_SUCCESS; } -/* BFO FAILOVER CODE - end */ - +#endif append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments, num_segments, NULL); OPAL_THREAD_UNLOCK(&comm->matching_lock); diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.c b/ompi/mca/pml/bfo/pml_bfo_recvreq.c index 0b34539877..66093a8bc9 100644 --- a/ompi/mca/pml/bfo/pml_bfo_recvreq.c +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.c @@ -33,9 +33,9 @@ #include "orte/mca/errmgr/errmgr.h" #include "opal/util/arch.h" #include "ompi/memchecker.h" -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO #include "pml_bfo_failover.h" -/* BFO FAILOVER CODE - end */ +#endif void mca_pml_bfo_recv_request_process_pending(void) { @@ -170,13 +170,13 @@ static void mca_pml_bfo_recv_ctl_completion( mca_btl_base_module_t* btl, { mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { mca_pml_bfo_check_recv_ctl_completion_status(btl, des, status); } MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, btl, des); -/* BFO FAILOVER CODE - end */ +#endif MCA_PML_BFO_PROGRESS_PENDING(bml_btl); } @@ -199,12 +199,13 @@ static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl, } OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); +#ifdef PML_BFO btl->btl_free(btl, des); - -/* BFO FAILOVER CODE - begin */ MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq); MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, btl, recvreq, "PUT"); -/* BFO FAILOVER CODE - end */ +#else + mca_bml_base_free(bml_btl, des); +#endif /* check completion status */ OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); @@ -249,18 +250,18 @@ int mca_pml_bfo_recv_request_ack_send_btl( /* initialize descriptor */ des->des_cbfunc = mca_pml_bfo_recv_ctl_completion; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO des->des_cbdata = (void *)proc; -/* BFO FAILOVER CODE - end */ +#endif rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_ACK); if( OPAL_LIKELY( rc >= 0 ) ) { -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { ((mca_pml_bfo_recv_request_t *)hdr_dst_req)->req_events++; } -/* BFO FAILOVER CODE - end */ +#endif return OMPI_SUCCESS; } mca_bml_base_free(bml_btl, des); @@ -346,29 +347,39 @@ static void mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl, mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata; mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { recvreq->req_events--; assert(recvreq->req_events >= 0); } -/* BFO FAILOVER CODE - end */ +#endif /* check completion status */ if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { +#ifdef PML_BFO MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq); +#else + /* TSW - FIX */ + ORTE_ERROR_LOG(status); + orte_errmgr.abort(-1, NULL); +#endif } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl); MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, btl, recvreq, "RDMA write"); -/* BFO FAILOVER CODE - end */ +#endif mca_pml_bfo_send_fin(recvreq->req_recv.req_base.req_proc, bml_btl, frag->rdma_hdr.hdr_rget.hdr_des, - des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq, - recvreq->req_recv.req_base.req_comm->c_contextid, - recvreq->req_recv.req_base.req_comm->c_my_rank); +#ifdef PML_BFO + des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq, + recvreq->req_recv.req_base.req_comm->c_contextid, + recvreq->req_recv.req_base.req_comm->c_my_rank); +#else + des->order, 0); +#endif /* is receive request complete */ OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); @@ -432,12 +443,12 @@ int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag ) orte_errmgr.abort(-1, NULL); } } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { recvreq->req_events++; } -/* BFO FAILOVER CODE - end */ +#endif return OMPI_SUCCESS; } @@ -519,9 +530,9 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq 0, bytes_received ); recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; -/* BFO FAILOVER CODE - end */ +#endif MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); /* if receive buffer is not contiguous we can't just RDMA read into it, so @@ -556,9 +567,9 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq size += hdr->hdr_segs[i].seg_len; } } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO frag->rdma_btl = btl; -/* BFO FAILOVER CODE - end */ +#endif frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__); @@ -828,9 +839,9 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq, continue; } ctl->des_cbfunc = mca_pml_bfo_recv_ctl_completion; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO ctl->des_cbdata = recvreq; -/* BFO FAILOVER CODE - end */ +#endif /* fill in rdma header */ hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_src->seg_addr.pval; @@ -838,9 +849,9 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq, hdr->hdr_common.hdr_flags = (!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0; hdr->hdr_req = recvreq->remote_req_send; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO hdr->hdr_dst_req.pval = recvreq; /* only needed in the first put message */ -/* BFO FAILOVER CODE - end */ +#endif hdr->hdr_des.pval = dst; hdr->hdr_rdma_offset = recvreq->req_rdma_offset; hdr->hdr_seg_cnt = dst->des_dst_cnt; @@ -862,12 +873,12 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq, /* send rdma request to peer */ rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_BFO_HDR_TYPE_PUT); if( OPAL_LIKELY( rc >= 0 ) ) { -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if ((btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && (ctl->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { recvreq->req_events++; } -/* BFO FAILOVER CODE - end */ +#endif /* update request state */ recvreq->req_rdma_offset += size; OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); @@ -993,12 +1004,12 @@ void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req) req->req_bytes_received = 0; req->req_bytes_expected = 0; /* What about req_rdma_cnt ? */ -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO req->req_rdma_cnt = 0; req->req_events = 0; req->req_restartseq = 0; req->req_errstate = 0; -/* BFO FAILOVER CODE - end */ +#endif req->req_rdma_idx = 0; req->req_pending = false; req->req_ack_sent = false; diff --git a/ompi/mca/pml/bfo/pml_bfo_recvreq.h b/ompi/mca/pml/bfo/pml_bfo_recvreq.h index aaab2c03a8..08a0391fb9 100644 --- a/ompi/mca/pml/bfo/pml_bfo_recvreq.h +++ b/ompi/mca/pml/bfo/pml_bfo_recvreq.h @@ -30,24 +30,23 @@ #include "ompi/mca/pml/bfo/pml_bfo_comm.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/pml/base/pml_base_recvreq.h" - -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO #define RECVREQ_RECVERRSENT 0x01 #define RECVREQ_RNDVRESTART_RECVED 0x02 #define RECVREQ_RNDVRESTART_ACKED 0x04 -/* BFO FAILOVER CODE - end */ +#endif BEGIN_C_DECLS struct mca_pml_bfo_recv_request_t { mca_pml_base_recv_request_t req_recv; ompi_ptr_t remote_req_send; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO int32_t req_msgseq; /* PML sequence number */ int32_t req_events; /* number of outstanding events on request */ int32_t req_restartseq; /* sequence number of restarted request */ int32_t req_errstate; /* state of request if in error */ -/* BFO FAILOVER CODE - end */ +#endif int32_t req_lock; size_t req_pipeline_depth; size_t req_bytes_received; /**< amount of data transferred into the user buffer */ @@ -169,10 +168,10 @@ recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq) } } recvreq->req_rdma_cnt = 0; -/* BFO FAILOVER CODE - begin */ - /* Initialize to a value that we indicate it is invalid */ - recvreq->req_msgseq = 42; -/* BFO FAILOVER CODE - end */ +#ifdef PML_BFO + /* Reset to a value that to indicate it is invalid. */ + recvreq->req_msgseq = recvreq->req_msgseq - 100; +#endif OPAL_THREAD_LOCK(&ompi_request_lock); if(true == recvreq->req_recv.req_base.req_free_called) { @@ -201,7 +200,11 @@ recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq) #endif if(recvreq->req_match_received && recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed && +#ifdef PML_BFO (0 == recvreq->req_events) && lock_recv_request(recvreq)) { +#else + lock_recv_request(recvreq)) { +#endif recv_request_pml_complete(recvreq); return true; } @@ -236,9 +239,9 @@ static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req, req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src; req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag; req->req_match_received = true; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO req->req_msgseq = hdr->hdr_seq; -/* BFO FAILOVER CODE - end */ +#endif #if OPAL_HAVE_THREAD_SUPPORT opal_atomic_wmb(); #endif diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.c b/ompi/mca/pml/bfo/pml_bfo_sendreq.c index 6a7925a12c..88888d5928 100644 --- a/ompi/mca/pml/bfo/pml_bfo_sendreq.c +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.c @@ -31,9 +31,9 @@ #include "pml_bfo_sendreq.h" #include "pml_bfo_rdmafrag.h" #include "pml_bfo_recvreq.h" -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO #include "pml_bfo_failover.h" -/* BFO FAILOVER CODE - end */ +#endif #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" @@ -183,12 +183,18 @@ mca_pml_bfo_match_completion_free( struct mca_btl_base_module_t* btl, /* check completion status */ if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO mca_pml_bfo_repost_match_fragment(des); return; -/* BFO FAILOVER CODE - end */ +#else + /* TSW - FIX */ + opal_output(0, "%s:%d FATAL", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); +#endif } +#ifdef PML_BFO MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "MATCH"); +#endif mca_pml_bfo_match_completion_free_request( bml_btl, sendreq ); } @@ -229,13 +235,20 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl, /* check completion status */ if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { - MCA_PML_BFO_ERROR_ON_RNDV_COMPLETION(sendreq, des); +#ifdef PML_BFO + if (true == mca_pml_bfo_rndv_completion_status_error(des, sendreq)) + return; +#else + /* TSW - FIX */ + opal_output(0, "%s:%d FATAL", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); +#endif } - -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO sendreq->req_events--; - MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RNDV_COMPLETION(sendreq, status, btl); -/* BFO FAILOVER CODE - end */ + MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, + MCA_PML_BFO_HDR_TYPE_RNDV, "RNDV"); +#endif /* count bytes of user data actually delivered. As the rndv completion only * happens in one thread, the increase of the req_bytes_delivered does not @@ -246,7 +259,9 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl, sizeof(mca_pml_bfo_rendezvous_hdr_t), req_bytes_delivered ); +#ifdef PML_BFO MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RNDV"); +#endif mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered ); } @@ -264,8 +279,9 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl, mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; size_t req_bytes_delivered = 0; - - MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RGET_COMPLETION(sendreq, btl, des); +#ifdef PML_BFO + MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des); +#endif /* count bytes of user data actually delivered and check for request completion */ MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt, @@ -274,8 +290,12 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl, send_request_pml_complete_check(sendreq); /* free the descriptor */ +#ifdef PML_BFO btl->btl_free(btl, des); MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, btl, sendreq, "RGET"); +#else + mca_bml_base_free(bml_btl, des); +#endif MCA_PML_BFO_PROGRESS_PENDING(bml_btl); } @@ -292,15 +312,15 @@ mca_pml_bfo_send_ctl_completion( mca_btl_base_module_t* btl, { mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; -/* BFO FAILOVER CODE - begin */ - mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; - if(OPAL_LIKELY(OMPI_SUCCESS == status)) { - /* check for pending requests */ - MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "ACK or RGET"); - MCA_PML_BFO_PROGRESS_PENDING(bml_btl); - } else { - MCA_PML_BFO_ERROR_ON_SEND_CTL_COMPLETION(sendreq, des); +#ifdef PML_BFO + if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) { + mca_pml_bfo_send_ctl_completion_status_error(des); + return; } + MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, des->des_cbdata, "RGET"); +#endif + /* check for pending requests */ + MCA_PML_BFO_PROGRESS_PENDING(bml_btl); } /** @@ -317,15 +337,19 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl, mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; size_t req_bytes_delivered = 0; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO sendreq->req_events--; -/* BFO FAILOVER CODE - end */ +#endif /* check completion status */ if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO sendreq->req_error++; -/* BFO FAILOVER CODE - end */ +#else + /* TSW - FIX */ + opal_output(0, "%s:%d FATAL", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); +#endif } /* count bytes of user data actually delivered */ @@ -337,52 +361,23 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl, OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); -/* BFO FAILOVER CODE - begin */ - /* note we check error after bytes delivered computation in case frag made it */ - if( OPAL_UNLIKELY(sendreq->req_error)) { - opal_output_verbose(30, mca_pml_bfo_output, - "FRAG: completion: sendreq has error, outstanding events=%d, " - "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", - sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, - sendreq->req_restartseq, (void *)sendreq, - sendreq->req_recv.pval, - status, sendreq->req_send.req_base.req_peer); - if (0 == sendreq->req_events) { - mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, - MCA_PML_BFO_HDR_TYPE_FRAG, - status, btl); - } - return; - } -/* BFO FAILOVER CODE - end */ +#ifdef PML_BFO + MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, + MCA_PML_BFO_HDR_TYPE_FRAG, "FRAG"); +#endif if(send_request_pml_complete_check(sendreq) == false) { mca_pml_bfo_send_request_schedule(sendreq); -/* BFO FAILOVER CODE - begin */ - if( OPAL_UNLIKELY(sendreq->req_error)) { - /* This situation can happen if the scheduling function - * determined that a BTL was removed from underneath us - * and therefore marked the request in error. In that - * case, the scheduling of fragments can no longer proceed - * properly. Therefore, if no outstanding events, initiate - * the restart dance. */ - opal_output_verbose(30, mca_pml_bfo_output, - "FRAG: completion: BTL has been removed, outstanding events=%d, " - "PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", - sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence, - sendreq->req_restartseq, (void *)sendreq, - sendreq->req_recv.pval, - status, sendreq->req_send.req_base.req_peer); - if (0 == sendreq->req_events) { - mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, - MCA_PML_BFO_HDR_TYPE_FRAG, - status, btl); - } - } -/* BFO FAILOVER CODE - end */ +#ifdef PML_BFO + MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, + MCA_PML_BFO_HDR_TYPE_FRAG, + "FRAG (BTL removal)"); +#endif } /* check for pending requests */ +#ifdef PML_BFO MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "FRAG"); +#endif MCA_PML_BFO_PROGRESS_PENDING(bml_btl); } @@ -438,7 +433,9 @@ int mca_pml_bfo_send_request_start_buffered( hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_src_req.pval = sendreq; +#ifdef PML_BFO MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV(buffered)"); +#endif bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc); @@ -487,11 +484,11 @@ int mca_pml_bfo_send_request_start_buffered( if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered); } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { sendreq->req_events++; } -/* BFO FAILOVER CODE - end */ +#endif return OMPI_SUCCESS; } mca_bml_base_free(bml_btl, des ); @@ -537,14 +534,13 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq, MCA_PML_BFO_HDR_TYPE_MATCH, &des); if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) { -/* BFO FAILOVER CODE - begin */ - /* Needed for failover */ +#ifdef PML_BFO + /* Needed in case of failover */ if (NULL != des) { des->des_cbfunc = mca_pml_bfo_match_completion_free; des->des_cbdata = sendreq->req_endpoint; } -/* BFO FAILOVER CODE - end */ - +#endif /* signal request completion */ send_request_pml_complete(sendreq); @@ -774,9 +770,9 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq, hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_src_req.pval = sendreq; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET"); -/* BFO FAILOVER CODE - end */ +#endif hdr->hdr_rget.hdr_des.pval = src; hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt; @@ -826,9 +822,9 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq, hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_src_req.pval = sendreq; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV"); -/* BFO FAILOVER CODE - end */ +#endif bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc); @@ -852,12 +848,12 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq, if( OPAL_LIKELY( 1 == rc ) && (true == need_local_cb)) { mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, 0 ); } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if ((des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) && (MCA_PML_BFO_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type)) { sendreq->req_events++; } -/* BFO FAILOVER CODE - end */ +#endif return OMPI_SUCCESS; } mca_bml_base_free(bml_btl, des); @@ -925,9 +921,9 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq, hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_src_req.pval = sendreq; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV"); -/* BFO FAILOVER CODE - end */ +#endif bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, sendreq->req_send.req_base.req_proc); @@ -945,11 +941,11 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq, if( OPAL_LIKELY( 1 == rc ) ) { mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, size ); } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { sendreq->req_events++; } -/* BFO FAILOVER CODE - end */ +#endif return OMPI_SUCCESS; } mca_bml_base_free(bml_btl, des ); @@ -1062,7 +1058,7 @@ mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq) mca_bml_base_btl_t* bml_btl; assert(range->range_send_length != 0); -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO /* Failover code. If this is true, this means the request thinks we * have more BTLs than there really are. This can happen because * a BTL was removed from the available list. In this case, we @@ -1072,7 +1068,7 @@ mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq) sendreq->req_error++; return OMPI_ERROR; } -/* BFO FAILOVER CODE - end */ +#endif if(prev_bytes_remaining == range->range_send_length) num_fail++; @@ -1181,11 +1177,11 @@ cannot_pack: range = get_next_send_range(sendreq, range); prev_bytes_remaining = 0; } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { sendreq->req_events++; } -/* BFO FAILOVER CODE - end */ +#endif } else { mca_bml_base_free(bml_btl,des); } @@ -1209,28 +1205,33 @@ static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl, mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata; mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)frag->rdma_req; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; -/* BFO FAILOVER CODE - begin */ - sendreq->req_events--; -/* BFO FAILOVER CODE - end */ /* check completion status */ if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO sendreq->req_error++; -/* BFO FAILOVER CODE - end */ +#else + /* TSW - FIX */ + ORTE_ERROR_LOG(status); + orte_errmgr.abort(-1, NULL); +#endif } - -/* BFO FAILOVER CODE - begin */ - MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_PUT_COMPLETION(sendreq, status, btl); +#ifdef PML_BFO + sendreq->req_events--; + MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl); MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RDMA write"); -/* BFO FAILOVER CODE - end */ +#endif mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, - des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence, - sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, - sendreq->req_send.req_base.req_comm->c_my_rank); +#ifdef PML_BFO + des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank); +#else + des->order, 0); +#endif /* check for request completion */ OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); @@ -1275,9 +1276,13 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag ) /* tell receiver to unregister memory */ mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, - MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence, - sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, - sendreq->req_send.req_base.req_comm->c_my_rank); +#ifdef PML_BFO + MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence, + sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, + sendreq->req_send.req_base.req_comm->c_my_rank); +#else + MCA_BTL_NO_ORDER, 1); +#endif /* send fragment by copy in/out */ mca_pml_bfo_send_request_copy_in_out(sendreq, @@ -1313,14 +1318,11 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag ) orte_errmgr.abort(-1, NULL); } } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { - mca_pml_bfo_send_request_t *sendreq = - (mca_pml_bfo_send_request_t*)frag->rdma_req; - sendreq->req_events++; + ((mca_pml_bfo_send_request_t*)frag->rdma_req)->req_events++; } -/* BFO FAILOVER CODE - end */ - +#endif return OMPI_SUCCESS; } @@ -1341,18 +1343,20 @@ void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq, size_t i, size = 0; if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_TYPE_ACK) { -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO /* Handle the failover case where a RNDV request may * have turned into a RGET and therefore the state * is not being tracked. */ if (sendreq->req_state != 0) { OPAL_THREAD_ADD32(&sendreq->req_state, -1); } -/* BFO FAILOVER CODE - end */ +#else + OPAL_THREAD_ADD32(&sendreq->req_state, -1); +#endif } -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO sendreq->req_recv = hdr->hdr_dst_req; /* only needed once, but it is OK */ -/* BFO FAILOVER CODE - end */ +#endif MCA_PML_BFO_RDMA_FRAG_ALLOC(frag, rc); @@ -1380,21 +1384,10 @@ void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq, } frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO frag->rdma_btl = btl; - if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { - opal_output(0, "[%s:%d] invalid bml for rdma put", __FILE__, __LINE__); - MCA_PML_BFO_RDMA_FRAG_RETURN(frag); - sendreq->req_error++; - if (0 == sendreq->req_events) { - opal_output(0, "[%s:%d] Issuing rndvrestartnotify", __FILE__, __LINE__); - mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, - MCA_PML_BFO_HDR_TYPE_PUT, - OMPI_ERROR, btl); - } - return; - } -/* BFO FAILOVER CODE - end */ + MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl); +#endif frag->rdma_hdr.hdr_rdma = *hdr; frag->rdma_req = sendreq; frag->rdma_ep = bml_endpoint; diff --git a/ompi/mca/pml/bfo/pml_bfo_sendreq.h b/ompi/mca/pml/bfo/pml_bfo_sendreq.h index c77f8a72c5..75cd106652 100644 --- a/ompi/mca/pml/bfo/pml_bfo_sendreq.h +++ b/ompi/mca/pml/bfo/pml_bfo_sendreq.h @@ -42,12 +42,12 @@ struct mca_pml_bfo_send_request_t { mca_pml_base_send_request_t req_send; mca_bml_base_endpoint_t* req_endpoint; ompi_ptr_t req_recv; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO int32_t req_events; /* number of outstanding events on request */ int32_t req_restartseq; /* sequence number of restarted request */ int32_t req_restart; /* state of restarted request */ int32_t req_error; /* non-zero when error has occurred on request */ -/* BFO FAILOVER CODE - end */ +#endif int32_t req_state; int32_t req_lock; bool req_throttle_sends; @@ -249,7 +249,7 @@ send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq) MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true); } sendreq->req_send.req_base.req_pml_complete = true; -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO assert(0 == sendreq->req_events); sendreq->req_restartseq = 0; /* Since sequence numbers increase monotonically and @@ -258,7 +258,7 @@ send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq) * as that is not within the valid range. */ sendreq->req_send.req_base.req_sequence = sendreq->req_send.req_base.req_sequence - 10; -/* BFO FAILOVER CODE - end */ +#endif if(sendreq->req_send.req_base.req_free_called) { MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq); @@ -437,12 +437,12 @@ mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq ) sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE; sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32( &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1); -/* BFO FAILOVER CODE - begin */ +#ifdef PML_BFO sendreq->req_restartseq = 0; /* counts up restarts */ sendreq->req_restart = 0; /* reset in case we restart again */ sendreq->req_error = 0; /* clear error state */ sendreq->req_events = 0; /* clear events, probably 0 anyways */ -/* BFO FAILOVER CODE - end */ +#endif MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );