1
1

Some more refactoring in the BFO PML. Getting it

as close to OB1 PML as possible.

This commit was SVN r23920.
Этот коммит содержится в:
Rolf vandeVaart 2010-10-22 18:13:35 +00:00
родитель 1766bf271a
Коммит 148ed00dd1
10 изменённых файлов: 383 добавлений и 310 удалений

Просмотреть файл

@ -48,9 +48,9 @@
#include "pml_bfo_sendreq.h" #include "pml_bfo_sendreq.h"
#include "pml_bfo_recvreq.h" #include "pml_bfo_recvreq.h"
#include "pml_bfo_rdmafrag.h" #include "pml_bfo_rdmafrag.h"
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
#include "pml_bfo_failover.h" #include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */ #endif
mca_pml_bfo_t mca_pml_bfo = { mca_pml_bfo_t mca_pml_bfo = {
{ {
@ -409,13 +409,12 @@ int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
NULL ); NULL );
if(OMPI_SUCCESS != rc) if(OMPI_SUCCESS != rc)
goto cleanup_and_return; goto cleanup_and_return;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
rc = mca_pml_bfo_register_callbacks(); rc = mca_pml_bfo_register_callbacks();
if(OMPI_SUCCESS != rc) if(OMPI_SUCCESS != rc)
goto cleanup_and_return; goto cleanup_and_return;
/* BFO FAILOVER CODE - end */ #endif
/* register error handlers */ /* register error handlers */
rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler); rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler);
if(OMPI_SUCCESS != rc) if(OMPI_SUCCESS != rc)
@ -472,14 +471,13 @@ static void mca_pml_bfo_fin_completion( mca_btl_base_module_t* btl,
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
mca_pml_bfo_repost_fin(des); mca_pml_bfo_repost_fin(des);
return; return;
} }
MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des); MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des);
/* BFO FAILOVER CODE - end */ #endif
/* check for pending requests */ /* check for pending requests */
MCA_PML_BFO_PROGRESS_PENDING(bml_btl); MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
} }
@ -494,12 +492,14 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl, mca_bml_base_btl_t* bml_btl,
ompi_ptr_t hdr_des, ompi_ptr_t hdr_des,
uint8_t order, uint8_t order,
#ifdef PML_BFO
uint32_t status, uint32_t status,
/* BFO FAILOVER CODE - begin */
uint16_t seq, uint16_t seq,
uint8_t restartseq, uint8_t restartseq,
uint16_t ctx, uint32_t src) uint16_t ctx, uint32_t src)
/* BFO FAILOVER CODE - end */ #else
uint32_t status )
#endif
{ {
mca_btl_base_descriptor_t* fin; mca_btl_base_descriptor_t* fin;
mca_pml_bfo_fin_hdr_t* hdr; mca_pml_bfo_fin_hdr_t* hdr;
@ -521,13 +521,13 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN; hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des; hdr->hdr_des = hdr_des;
hdr->hdr_fail = status; hdr->hdr_fail = status;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
fin->des_cbdata = proc; fin->des_cbdata = proc;
hdr->hdr_match.hdr_seq = seq; hdr->hdr_match.hdr_seq = seq;
hdr->hdr_match.hdr_ctx = ctx; hdr->hdr_match.hdr_ctx = ctx;
hdr->hdr_match.hdr_src = src; hdr->hdr_match.hdr_src = src;
hdr->hdr_match.hdr_common.hdr_flags = restartseq; /* use unused hdr_flags field */ hdr->hdr_match.hdr_common.hdr_flags = restartseq; /* use unused hdr_flags field */
/* BFO FAILOVER CODE - end */ #endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc); bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc);
@ -594,11 +594,15 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl)
rc = mca_pml_bfo_send_fin(pckt->proc, send_dst, rc = mca_pml_bfo_send_fin(pckt->proc, send_dst,
pckt->hdr.hdr_fin.hdr_des, pckt->hdr.hdr_fin.hdr_des,
pckt->order, pckt->order,
#ifdef PML_BFO
pckt->hdr.hdr_fin.hdr_fail, pckt->hdr.hdr_fin.hdr_fail,
pckt->hdr.hdr_fin.hdr_match.hdr_seq, pckt->hdr.hdr_fin.hdr_match.hdr_seq,
pckt->hdr.hdr_fin.hdr_match.hdr_common.hdr_flags, pckt->hdr.hdr_fin.hdr_match.hdr_common.hdr_flags,
pckt->hdr.hdr_fin.hdr_match.hdr_ctx, pckt->hdr.hdr_fin.hdr_match.hdr_ctx,
pckt->hdr.hdr_fin.hdr_match.hdr_src); pckt->hdr.hdr_fin.hdr_match.hdr_src);
#else
pckt->hdr.hdr_fin.hdr_fail);
#endif
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) { if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
return; return;
} }
@ -640,14 +644,13 @@ void mca_pml_bfo_process_pending_rdma(void)
void mca_pml_bfo_error_handler( void mca_pml_bfo_error_handler(
struct mca_btl_base_module_t* btl, int32_t flags, struct mca_btl_base_module_t* btl, int32_t flags,
ompi_proc_t* errproc, char* btlinfo ) { ompi_proc_t* errproc, char* btlinfo ) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* If we get a non-fatal error, try to failover */
if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) { if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) {
mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlinfo); mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlinfo);
/* BFO FAILOVER CODE - end */ return;
} else {
orte_errmgr.abort(-1, NULL);
} }
#endif
orte_errmgr.abort(-1, NULL);
} }
#if OPAL_ENABLE_FT_CR == 0 #if OPAL_ENABLE_FT_CR == 0

Просмотреть файл

@ -23,6 +23,7 @@
#ifndef MCA_PML_BFO_H #ifndef MCA_PML_BFO_H
#define MCA_PML_BFO_H #define MCA_PML_BFO_H
#define PML_BFO 1
#include "ompi_config.h" #include "ompi_config.h"
#include "ompi/class/ompi_free_list.h" #include "ompi/class/ompi_free_list.h"
#include "ompi/request/request.h" #include "ompi/request/request.h"
@ -225,8 +226,12 @@ do { \
int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
#ifdef PML_BFO
ompi_ptr_t hdr_des, uint8_t order, uint32_t status, ompi_ptr_t hdr_des, uint8_t order, uint32_t status,
uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src); uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src);
#else
ompi_ptr_t hdr_des, uint8_t order, uint32_t status);
#endif
/* This function tries to resend FIN/ACK packets from pckt_pending queue. /* This function tries to resend FIN/ACK packets from pckt_pending queue.
* Packets are added to the queue when sending of FIN or ACK is failed due to * Packets are added to the queue when sending of FIN or ACK is failed due to

Просмотреть файл

@ -2103,3 +2103,102 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
} }
} }
} }
/**
* The completion event for the RNDV message has returned with an
* error. We know that the send request we are looking at is valid
* because it cannot be completed until the sendreq->req_state value
* reaches 0. And for the sendreq->req_state to reach 0, the
* completion event on the RNDV message must occur. So, we do not
* bother checking whether the send request is valid, because we know
* it is, but we put a few asserts in for good measure. We then check
* a few fields in the request to decide what to do. If the
* sendreq->req_error is set, that means that something has happend
* already to the request and we do not want to restart it.
* Presumably, we may have received a RECVERRNOTIFY message from the
* receiver. We also check the sendreq->req_acked field to see if it
* has been acked. If it has, then again we do not restart everything
* because obviously the RNDV message has made it to the other side.
*/
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
mca_pml_bfo_send_request_t* sendreq)
{
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_ctx ==
(sendreq)->req_send.req_base.req_comm->c_contextid);
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_src ==
(sendreq)->req_send.req_base.req_comm->c_my_rank);
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_seq ==
(uint16_t)(sendreq)->req_send.req_base.req_sequence);
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) {
(sendreq)->req_events--;
/* Assume RNDV did not make it, so restart from the beginning. */
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV);
return true;
}
return false;
}
/**
* Check to see if an error has occurred on this send request. If it has
* and there are no outstanding events, then we can start the restart dance.
*/
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq,
int status,
mca_btl_base_module_t* btl,
int type,
char *description)
{
opal_output_verbose(30, mca_pml_bfo_output,
"%s: completion: sendreq has error, outstanding events=%d, "
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
description,
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, (void *)sendreq,
sendreq->req_recv.pval,
status, sendreq->req_send.req_base.req_peer);
if (0 == sendreq->req_events) {
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
type, status, btl);
}
}
/* If we get an error on the RGET message, then first make sure that
* header matches the send request that we are pointing to. This is
* necessary, because even though the sending side got an error, the
* RGET may have made it to the receiving side and the message transfer
* may have completed. This would then mean the send request has been
* completed and perhaps in use by another communication. So there is
* no need to restart this request. Therefore, ensure that we are
* looking at the same request that the header thinks we are looking
* at. If not, then there is nothing else to be done. */
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des)
{
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval;
switch (hdr->hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_RGET:
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) ||
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) {
opal_output_verbose(30, mca_pml_bfo_output,
"RGET: completion event: dropping because no valid request "
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
"RQS:exp=%d,act=%d, dst_req=%p",
(uint16_t)sendreq->req_send.req_base.req_sequence,
hdr->hdr_match.hdr_seq,
sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx,
sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_match.hdr_src,
sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq,
(void *)sendreq);
return;
}
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET);
return;
default:
opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)",
__FILE__, __LINE__, hdr->hdr_common.hdr_type);
orte_errmgr.abort(-1, NULL);
}
}

Просмотреть файл

@ -105,6 +105,16 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
mca_pml_bfo_recv_request_t* recvreq, mca_pml_bfo_recv_request_t* recvreq,
char* type); char* type);
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
mca_pml_bfo_send_request_t* sendreq);
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des);
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq,
int status,
mca_btl_base_module_t* btl,
int type,
char *description);
/** /**
* Four new callbacks for the four new message types. * Four new callbacks for the four new message types.
*/ */
@ -243,57 +253,33 @@ extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t
* Macros for pml_bfo_sendreq.c file. * Macros for pml_bfo_sendreq.c file.
*/ */
/* The completion event for the RNDV message has returned with an
* error. We know that the send request we are looking at is valid
* because it cannot be completed until the sendreq->req_state value
* reaches 0. And for the sendreq->req_state to reach 0, the
* completion event on the RNDV message must occur. So, we do not
* bother checking whether the send request is valid, because we know
* it is, but we put a few asserts in for good measure. We then check
* a few fields in the request to decide what to do. If the
* sendreq->req_error is set, that means that something has happend
* already to the request and we do not want to restart it.
* Presumably, we may have received a RECVERRNOTIFY message from the
* receiver. We also check the sendreq->req_acked field to see if it
* has been acked. If it has, then again we do not restart everything
* because obviously the RNDV message has made it to the other side. */
#define MCA_PML_BFO_ERROR_ON_RNDV_COMPLETION(sendreq, des) \
do { \
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_ctx == \
(sendreq)->req_send.req_base.req_comm->c_contextid); \
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_src == \
(sendreq)->req_send.req_base.req_comm->c_my_rank); \
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_seq == \
(uint16_t)(sendreq)->req_send.req_base.req_sequence); \
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) { \
(sendreq)->req_events--; \
/* Assume RNDV did not make it, so restart from the beginning. */ \
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV); \
return; \
} \
} while (0)
/* Now check the error state. This request can be in error if the /* Now check the error state. This request can be in error if the
* RNDV message made it over, but the receiver got an error trying to * RNDV message made it over, but the receiver got an error trying to
* send the ACK back and therefore sent a RECVERRNOTIFY message. In * send the ACK back and therefore sent a RECVERRNOTIFY message. In
* that case, we want to start the restart dance as the receiver has * that case, we want to start the restart dance as the receiver has
* matched this message already. Only restart if there are no * matched this message already. Only restart if there are no
* outstanding events on send request. */ * outstanding events on send request. */
#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RNDV_COMPLETION(sendreq, status, btl) \ #define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
if ((sendreq)->req_error) { \ if( OPAL_UNLIKELY ((sendreq)->req_error)) { \
opal_output_verbose(30, mca_pml_bfo_output, \ mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
"RNDV: completion: sendreq has error, outstanding events=%d, " \ btl, type, description); \
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", \ return; \
(sendreq)->req_events, \ }
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(sendreq)->req_restartseq, (unsigned long)(sendreq), \ /**
(unsigned long)(sendreq)->req_recv.pval, \ * This macro is called within the frag completion function in two
status, (sendreq)->req_send.req_base.req_peer); \ * places. It is called to see if any errors occur prior to the
if (0 == (sendreq)->req_events) { \ * completion event on the frag. It is then called a second time
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \ * after the scheduling routine is called as the scheduling routine
MCA_PML_BFO_HDR_TYPE_RNDV, \ * may have detected that a BTL that was cached on the request had
status, btl); \ * been removed and therefore marked the request in error. In that
} \ * case, the scheduling of fragments can no longer proceed properly,
* and if there are no outstanding events, iniated the restart dance.
*/
#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
btl, type, description); \
return; \ return; \
} }
@ -302,7 +288,7 @@ do {
* field is not checked here. That is because that is the value * field is not checked here. That is because that is the value
* returned in the FIN hdr.hdr_fail field and may be used for other * returned in the FIN hdr.hdr_fail field and may be used for other
* things. */ * things. */
#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RGET_COMPLETION(sendreq, btl, des) \ #define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des) \
if( OPAL_UNLIKELY(sendreq->req_error)) { \ if( OPAL_UNLIKELY(sendreq->req_error)) { \
opal_output_verbose(30, mca_pml_bfo_output, \ opal_output_verbose(30, mca_pml_bfo_output, \
"FIN: received on broken request, skipping, " \ "FIN: received on broken request, skipping, " \
@ -314,66 +300,15 @@ do {
return; \ return; \
} }
/* If we get an error on the RGET message, then first make sure that
* header matches the send request that we are pointing to. This is
* necessary, because even though the sending side got an error, the
* RGET may have made it to the receiving side and the message transfer
* may have completed. This would then mean the send request has been
* completed and perhaps in use by another communication. So there is
* no need to restart this request. Therefore, ensure that we are
* looking at the same request that the header thinks we are looking
* at. If not, then there is nothing else to be done. */
#define MCA_PML_BFO_ERROR_ON_SEND_CTL_COMPLETION(sendreq, des) \
do { \
mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval; \
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; \
switch (hdr->hdr_common.hdr_type) { \
case MCA_PML_BFO_HDR_TYPE_RGET: \
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || \
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) || \
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RGET: completion event: dropping because no valid request " \
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " \
"RQS:exp=%d,act=%d, dst_req=%p", \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
hdr->hdr_match.hdr_seq, \
sendreq->req_send.req_base.req_comm->c_contextid, \
hdr->hdr_match.hdr_ctx, \
sendreq->req_send.req_base.req_comm->c_my_rank, \
hdr->hdr_match.hdr_src, \
sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq, \
(void *)sendreq); \
return; \
} \
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET); \
return; \
default: \
opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)", \
__FILE__, __LINE__, hdr->hdr_common.hdr_type); \
orte_errmgr.abort(-1, NULL); \
} \
} while (0)
/* Check if there has been an error on the send request when we get /* Check if there has been an error on the send request when we get
* a completion event on the RDMA write. */ * a completion event on the RDMA write. */
#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_PUT_COMPLETION(sendreq, status, btl) \ #define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl) \
if ( OPAL_UNLIKELY(sendreq->req_error)) { \ if ( OPAL_UNLIKELY(sendreq->req_error)) { \
opal_output_verbose(30, mca_pml_bfo_output, \ mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \
"RDMA write: completion: sendreq has error, outstanding events=%d," \ MCA_PML_BFO_HDR_TYPE_PUT, "RDMA write"); \
" PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", \ MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
sendreq->req_events, \ return; \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
sendreq->req_restartseq, (void *)sendreq, \
sendreq->req_recv.pval, \
status, sendreq->req_send.req_base.req_peer); \
if (0 == sendreq->req_events) { \
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
MCA_PML_BFO_HDR_TYPE_PUT, \
status, btl); \
} \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
return; \
} }
#define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type) \ #define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type) \
@ -416,6 +351,28 @@ do {
mca_pml_bfo_update_eager_bml_btl_recv_ctl(&bml_btl, btl, des); \ mca_pml_bfo_update_eager_bml_btl_recv_ctl(&bml_btl, btl, des); \
} }
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl) \
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"PUT received: no matching BTL to RDMA write to, oustanding " \
"events=%d, PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
sendreq->req_events, \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
sendreq->req_restartseq, (void *)sendreq, \
sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
sendreq->req_error++; \
if (0 == sendreq->req_events) { \
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
MCA_PML_BFO_HDR_TYPE_PUT, \
OMPI_ERROR, btl); \
} \
return; \
}
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -36,7 +36,9 @@ typedef enum {
struct mca_pml_bfo_rdma_frag_t { struct mca_pml_bfo_rdma_frag_t {
ompi_free_list_item_t super; ompi_free_list_item_t super;
mca_bml_base_btl_t* rdma_bml; mca_bml_base_btl_t* rdma_bml;
#ifdef PML_BFO
mca_btl_base_module_t* rdma_btl; mca_btl_base_module_t* rdma_btl;
#endif
mca_pml_bfo_hdr_t rdma_hdr; mca_pml_bfo_hdr_t rdma_hdr;
mca_pml_bfo_rdma_state_t rdma_state; mca_pml_bfo_rdma_state_t rdma_state;
size_t rdma_length; size_t rdma_length;

Просмотреть файл

@ -41,9 +41,9 @@
#include "pml_bfo_recvreq.h" #include "pml_bfo_recvreq.h"
#include "pml_bfo_sendreq.h" #include "pml_bfo_sendreq.h"
#include "pml_bfo_hdr.h" #include "pml_bfo_hdr.h"
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
#include "pml_bfo_failover.h" #include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */ #endif
OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t, OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t,
ompi_free_list_item_t, ompi_free_list_item_t,
@ -242,13 +242,13 @@ void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
slow_path: slow_path:
OPAL_THREAD_UNLOCK(&comm->matching_lock); OPAL_THREAD_UNLOCK(&comm->matching_lock);
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* Check for duplicate messages. If message is duplicate, then just /* Check for duplicate messages. If message is duplicate, then just
* return as that essentially drops the message. */ * return as that essentially drops the message. */
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
return; return;
} }
/* BFO FAILOVER CODE - end */ #endif
mca_pml_bfo_recv_frag_match(btl, hdr, segments, mca_pml_bfo_recv_frag_match(btl, hdr, segments,
num_segments, MCA_PML_BFO_HDR_TYPE_MATCH); num_segments, MCA_PML_BFO_HDR_TYPE_MATCH);
} }
@ -306,9 +306,9 @@ void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK); bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK);
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval; sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
sendreq->req_recv = hdr->hdr_ack.hdr_dst_req; sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq) MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq)
/* BFO FAILOVER CODE - end */ #endif
/* if the request should be delivered entirely by copy in/out /* if the request should be delivered entirely by copy in/out
* then throttle sends */ * then throttle sends */
@ -352,9 +352,9 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
} }
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG); bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG);
recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq) MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq)
/* BFO FAILOVER CODE - end */ #endif
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt); mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
return; return;
@ -375,9 +375,9 @@ void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT); bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT);
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval; sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq) MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq)
/* BFO FAILOVER CODE - end */ #endif
mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma); mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma);
return; return;
@ -398,11 +398,11 @@ void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl,
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN); bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN);
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval; rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) { if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) {
return; return;
} }
/* BFO FAILOVER CODE - end */ #endif
rdma->des_cbfunc(btl, NULL, rdma, rdma->des_cbfunc(btl, NULL, rdma,
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS); hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
@ -626,7 +626,7 @@ static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
* the fragment. * the fragment.
*/ */
OPAL_THREAD_LOCK(&comm->matching_lock); OPAL_THREAD_LOCK(&comm->matching_lock);
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* In case of network failover, we may get a message telling us to /* In case of network failover, we may get a message telling us to
* restart. In that case, we already have a pointer to the receive * restart. In that case, we already have a pointer to the receive
* request in the header itself. */ * request in the header itself. */
@ -635,8 +635,8 @@ static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
if (NULL == match) { if (NULL == match) {
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* BFO FAILOVER CODE - end */
} else { } else {
#endif
/* get sequence number of next message that can be processed */ /* get sequence number of next message that can be processed */
next_msg_seq_expected = (uint16_t)proc->expected_sequence; next_msg_seq_expected = (uint16_t)proc->expected_sequence;
@ -673,8 +673,9 @@ out_of_order_match:
/* release matching lock before processing fragment */ /* release matching lock before processing fragment */
OPAL_THREAD_UNLOCK(&comm->matching_lock); OPAL_THREAD_UNLOCK(&comm->matching_lock);
#ifdef PML_BFO
} }
#endif
if(OPAL_LIKELY(match)) { if(OPAL_LIKELY(match)) {
switch(type) { switch(type) {
case MCA_PML_BFO_HDR_TYPE_MATCH: case MCA_PML_BFO_HDR_TYPE_MATCH:
@ -716,14 +717,13 @@ wrong_seq:
* This message comes after the next expected, so it * This message comes after the next expected, so it
* is ahead of sequence. Save it for later. * is ahead of sequence. Save it for later.
*/ */
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* Check for duplicate messages. If message is duplicate, then just /* Check for duplicate messages. If message is duplicate, then just
* return as that essentially drops the message. */ * return as that essentially drops the message. */
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) { if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/* BFO FAILOVER CODE - end */ #endif
append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments, append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments,
num_segments, NULL); num_segments, NULL);
OPAL_THREAD_UNLOCK(&comm->matching_lock); OPAL_THREAD_UNLOCK(&comm->matching_lock);

Просмотреть файл

@ -33,9 +33,9 @@
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "opal/util/arch.h" #include "opal/util/arch.h"
#include "ompi/memchecker.h" #include "ompi/memchecker.h"
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
#include "pml_bfo_failover.h" #include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */ #endif
void mca_pml_bfo_recv_request_process_pending(void) void mca_pml_bfo_recv_request_process_pending(void)
{ {
@ -170,13 +170,13 @@ static void mca_pml_bfo_recv_ctl_completion( mca_btl_base_module_t* btl,
{ {
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) {
mca_pml_bfo_check_recv_ctl_completion_status(btl, des, status); mca_pml_bfo_check_recv_ctl_completion_status(btl, des, status);
} }
MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, btl, des); MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, btl, des);
/* BFO FAILOVER CODE - end */ #endif
MCA_PML_BFO_PROGRESS_PENDING(bml_btl); MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
} }
@ -199,12 +199,13 @@ static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl,
} }
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
#ifdef PML_BFO
btl->btl_free(btl, des); btl->btl_free(btl, des);
/* BFO FAILOVER CODE - begin */
MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq); MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq);
MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, btl, recvreq, "PUT"); MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, btl, recvreq, "PUT");
/* BFO FAILOVER CODE - end */ #else
mca_bml_base_free(bml_btl, des);
#endif
/* check completion status */ /* check completion status */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
@ -249,18 +250,18 @@ int mca_pml_bfo_recv_request_ack_send_btl(
/* initialize descriptor */ /* initialize descriptor */
des->des_cbfunc = mca_pml_bfo_recv_ctl_completion; des->des_cbfunc = mca_pml_bfo_recv_ctl_completion;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
des->des_cbdata = (void *)proc; des->des_cbdata = (void *)proc;
/* BFO FAILOVER CODE - end */ #endif
rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_ACK); rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_ACK);
if( OPAL_LIKELY( rc >= 0 ) ) { if( OPAL_LIKELY( rc >= 0 ) ) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
(des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
((mca_pml_bfo_recv_request_t *)hdr_dst_req)->req_events++; ((mca_pml_bfo_recv_request_t *)hdr_dst_req)->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
mca_bml_base_free(bml_btl, des); mca_bml_base_free(bml_btl, des);
@ -346,29 +347,39 @@ static void mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata; mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata;
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req; mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) { if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) {
recvreq->req_events--; recvreq->req_events--;
assert(recvreq->req_events >= 0); assert(recvreq->req_events >= 0);
} }
/* BFO FAILOVER CODE - end */ #endif
/* check completion status */ /* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
#ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq); MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq);
#else
/* TSW - FIX */
ORTE_ERROR_LOG(status);
orte_errmgr.abort(-1, NULL);
#endif
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl); MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl);
MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, btl, recvreq, "RDMA write"); MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, btl, recvreq, "RDMA write");
/* BFO FAILOVER CODE - end */ #endif
mca_pml_bfo_send_fin(recvreq->req_recv.req_base.req_proc, mca_pml_bfo_send_fin(recvreq->req_recv.req_base.req_proc,
bml_btl, bml_btl,
frag->rdma_hdr.hdr_rget.hdr_des, frag->rdma_hdr.hdr_rget.hdr_des,
des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq, #ifdef PML_BFO
recvreq->req_recv.req_base.req_comm->c_contextid, des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq,
recvreq->req_recv.req_base.req_comm->c_my_rank); recvreq->req_recv.req_base.req_comm->c_contextid,
recvreq->req_recv.req_base.req_comm->c_my_rank);
#else
des->order, 0);
#endif
/* is receive request complete */ /* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
@ -432,12 +443,12 @@ int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag )
orte_errmgr.abort(-1, NULL); orte_errmgr.abort(-1, NULL);
} }
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
(descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
recvreq->req_events++; recvreq->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -519,9 +530,9 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq
0, bytes_received ); 0, bytes_received );
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length; recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
/* BFO FAILOVER CODE - end */ #endif
MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match); MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
/* if receive buffer is not contiguous we can't just RDMA read into it, so /* if receive buffer is not contiguous we can't just RDMA read into it, so
@ -556,9 +567,9 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq
size += hdr->hdr_segs[i].seg_len; size += hdr->hdr_segs[i].seg_len;
} }
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
frag->rdma_btl = btl; frag->rdma_btl = btl;
/* BFO FAILOVER CODE - end */ #endif
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__); opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
@ -828,9 +839,9 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
continue; continue;
} }
ctl->des_cbfunc = mca_pml_bfo_recv_ctl_completion; ctl->des_cbfunc = mca_pml_bfo_recv_ctl_completion;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
ctl->des_cbdata = recvreq; ctl->des_cbdata = recvreq;
/* BFO FAILOVER CODE - end */ #endif
/* fill in rdma header */ /* fill in rdma header */
hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_src->seg_addr.pval; hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_src->seg_addr.pval;
@ -838,9 +849,9 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
hdr->hdr_common.hdr_flags = hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0; (!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0;
hdr->hdr_req = recvreq->remote_req_send; hdr->hdr_req = recvreq->remote_req_send;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
hdr->hdr_dst_req.pval = recvreq; /* only needed in the first put message */ hdr->hdr_dst_req.pval = recvreq; /* only needed in the first put message */
/* BFO FAILOVER CODE - end */ #endif
hdr->hdr_des.pval = dst; hdr->hdr_des.pval = dst;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset; hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_seg_cnt = dst->des_dst_cnt; hdr->hdr_seg_cnt = dst->des_dst_cnt;
@ -862,12 +873,12 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
/* send rdma request to peer */ /* send rdma request to peer */
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_BFO_HDR_TYPE_PUT); rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_BFO_HDR_TYPE_PUT);
if( OPAL_LIKELY( rc >= 0 ) ) { if( OPAL_LIKELY( rc >= 0 ) ) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if ((btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) && if ((btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
(ctl->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) { (ctl->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
recvreq->req_events++; recvreq->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
/* update request state */ /* update request state */
recvreq->req_rdma_offset += size; recvreq->req_rdma_offset += size;
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1); OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
@ -993,12 +1004,12 @@ void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req)
req->req_bytes_received = 0; req->req_bytes_received = 0;
req->req_bytes_expected = 0; req->req_bytes_expected = 0;
/* What about req_rdma_cnt ? */ /* What about req_rdma_cnt ? */
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
req->req_rdma_cnt = 0; req->req_rdma_cnt = 0;
req->req_events = 0; req->req_events = 0;
req->req_restartseq = 0; req->req_restartseq = 0;
req->req_errstate = 0; req->req_errstate = 0;
/* BFO FAILOVER CODE - end */ #endif
req->req_rdma_idx = 0; req->req_rdma_idx = 0;
req->req_pending = false; req->req_pending = false;
req->req_ack_sent = false; req->req_ack_sent = false;

Просмотреть файл

@ -30,24 +30,23 @@
#include "ompi/mca/pml/bfo/pml_bfo_comm.h" #include "ompi/mca/pml/bfo/pml_bfo_comm.h"
#include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/pml/base/pml_base_recvreq.h" #include "ompi/mca/pml/base/pml_base_recvreq.h"
#ifdef PML_BFO
/* BFO FAILOVER CODE - begin */
#define RECVREQ_RECVERRSENT 0x01 #define RECVREQ_RECVERRSENT 0x01
#define RECVREQ_RNDVRESTART_RECVED 0x02 #define RECVREQ_RNDVRESTART_RECVED 0x02
#define RECVREQ_RNDVRESTART_ACKED 0x04 #define RECVREQ_RNDVRESTART_ACKED 0x04
/* BFO FAILOVER CODE - end */ #endif
BEGIN_C_DECLS BEGIN_C_DECLS
struct mca_pml_bfo_recv_request_t { struct mca_pml_bfo_recv_request_t {
mca_pml_base_recv_request_t req_recv; mca_pml_base_recv_request_t req_recv;
ompi_ptr_t remote_req_send; ompi_ptr_t remote_req_send;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
int32_t req_msgseq; /* PML sequence number */ int32_t req_msgseq; /* PML sequence number */
int32_t req_events; /* number of outstanding events on request */ int32_t req_events; /* number of outstanding events on request */
int32_t req_restartseq; /* sequence number of restarted request */ int32_t req_restartseq; /* sequence number of restarted request */
int32_t req_errstate; /* state of request if in error */ int32_t req_errstate; /* state of request if in error */
/* BFO FAILOVER CODE - end */ #endif
int32_t req_lock; int32_t req_lock;
size_t req_pipeline_depth; size_t req_pipeline_depth;
size_t req_bytes_received; /**< amount of data transferred into the user buffer */ size_t req_bytes_received; /**< amount of data transferred into the user buffer */
@ -169,10 +168,10 @@ recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq)
} }
} }
recvreq->req_rdma_cnt = 0; recvreq->req_rdma_cnt = 0;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* Initialize to a value that we indicate it is invalid */ /* Reset to a value that to indicate it is invalid. */
recvreq->req_msgseq = 42; recvreq->req_msgseq = recvreq->req_msgseq - 100;
/* BFO FAILOVER CODE - end */ #endif
OPAL_THREAD_LOCK(&ompi_request_lock); OPAL_THREAD_LOCK(&ompi_request_lock);
if(true == recvreq->req_recv.req_base.req_free_called) { if(true == recvreq->req_recv.req_base.req_free_called) {
@ -201,7 +200,11 @@ recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq)
#endif #endif
if(recvreq->req_match_received && if(recvreq->req_match_received &&
recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed && recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
#ifdef PML_BFO
(0 == recvreq->req_events) && lock_recv_request(recvreq)) { (0 == recvreq->req_events) && lock_recv_request(recvreq)) {
#else
lock_recv_request(recvreq)) {
#endif
recv_request_pml_complete(recvreq); recv_request_pml_complete(recvreq);
return true; return true;
} }
@ -236,9 +239,9 @@ static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req,
req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src; req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag; req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
req->req_match_received = true; req->req_match_received = true;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
req->req_msgseq = hdr->hdr_seq; req->req_msgseq = hdr->hdr_seq;
/* BFO FAILOVER CODE - end */ #endif
#if OPAL_HAVE_THREAD_SUPPORT #if OPAL_HAVE_THREAD_SUPPORT
opal_atomic_wmb(); opal_atomic_wmb();
#endif #endif

Просмотреть файл

@ -31,9 +31,9 @@
#include "pml_bfo_sendreq.h" #include "pml_bfo_sendreq.h"
#include "pml_bfo_rdmafrag.h" #include "pml_bfo_rdmafrag.h"
#include "pml_bfo_recvreq.h" #include "pml_bfo_recvreq.h"
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
#include "pml_bfo_failover.h" #include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */ #endif
#include "ompi/mca/bml/base/base.h" #include "ompi/mca/bml/base/base.h"
#include "ompi/memchecker.h" #include "ompi/memchecker.h"
@ -183,12 +183,18 @@ mca_pml_bfo_match_completion_free( struct mca_btl_base_module_t* btl,
/* check completion status */ /* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
mca_pml_bfo_repost_match_fragment(des); mca_pml_bfo_repost_match_fragment(des);
return; return;
/* BFO FAILOVER CODE - end */ #else
/* TSW - FIX */
opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
#endif
} }
#ifdef PML_BFO
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "MATCH"); MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "MATCH");
#endif
mca_pml_bfo_match_completion_free_request( bml_btl, sendreq ); mca_pml_bfo_match_completion_free_request( bml_btl, sendreq );
} }
@ -229,13 +235,20 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl,
/* check completion status */ /* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
MCA_PML_BFO_ERROR_ON_RNDV_COMPLETION(sendreq, des); #ifdef PML_BFO
if (true == mca_pml_bfo_rndv_completion_status_error(des, sendreq))
return;
#else
/* TSW - FIX */
opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
#endif
} }
#ifdef PML_BFO
/* BFO FAILOVER CODE - begin */
sendreq->req_events--; sendreq->req_events--;
MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RNDV_COMPLETION(sendreq, status, btl); MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl,
/* BFO FAILOVER CODE - end */ MCA_PML_BFO_HDR_TYPE_RNDV, "RNDV");
#endif
/* count bytes of user data actually delivered. As the rndv completion only /* count bytes of user data actually delivered. As the rndv completion only
* happens in one thread, the increase of the req_bytes_delivered does not * happens in one thread, the increase of the req_bytes_delivered does not
@ -246,7 +259,9 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl,
sizeof(mca_pml_bfo_rendezvous_hdr_t), sizeof(mca_pml_bfo_rendezvous_hdr_t),
req_bytes_delivered ); req_bytes_delivered );
#ifdef PML_BFO
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RNDV"); MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RNDV");
#endif
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered ); mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
} }
@ -264,8 +279,9 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
size_t req_bytes_delivered = 0; size_t req_bytes_delivered = 0;
#ifdef PML_BFO
MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RGET_COMPLETION(sendreq, btl, des); MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des);
#endif
/* count bytes of user data actually delivered and check for request completion */ /* count bytes of user data actually delivered and check for request completion */
MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt, MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
@ -274,8 +290,12 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
send_request_pml_complete_check(sendreq); send_request_pml_complete_check(sendreq);
/* free the descriptor */ /* free the descriptor */
#ifdef PML_BFO
btl->btl_free(btl, des); btl->btl_free(btl, des);
MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, btl, sendreq, "RGET"); MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, btl, sendreq, "RGET");
#else
mca_bml_base_free(bml_btl, des);
#endif
MCA_PML_BFO_PROGRESS_PENDING(bml_btl); MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
} }
@ -292,15 +312,15 @@ mca_pml_bfo_send_ctl_completion( mca_btl_base_module_t* btl,
{ {
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
if(OPAL_LIKELY(OMPI_SUCCESS == status)) { mca_pml_bfo_send_ctl_completion_status_error(des);
/* check for pending requests */ return;
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "ACK or RGET");
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
} else {
MCA_PML_BFO_ERROR_ON_SEND_CTL_COMPLETION(sendreq, des);
} }
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, des->des_cbdata, "RGET");
#endif
/* check for pending requests */
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
} }
/** /**
@ -317,15 +337,19 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
size_t req_bytes_delivered = 0; size_t req_bytes_delivered = 0;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
sendreq->req_events--; sendreq->req_events--;
/* BFO FAILOVER CODE - end */ #endif
/* check completion status */ /* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
sendreq->req_error++; sendreq->req_error++;
/* BFO FAILOVER CODE - end */ #else
/* TSW - FIX */
opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
#endif
} }
/* count bytes of user data actually delivered */ /* count bytes of user data actually delivered */
@ -337,52 +361,23 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl,
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* note we check error after bytes delivered computation in case frag made it */ MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl,
if( OPAL_UNLIKELY(sendreq->req_error)) { MCA_PML_BFO_HDR_TYPE_FRAG, "FRAG");
opal_output_verbose(30, mca_pml_bfo_output, #endif
"FRAG: completion: sendreq has error, outstanding events=%d, "
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, (void *)sendreq,
sendreq->req_recv.pval,
status, sendreq->req_send.req_base.req_peer);
if (0 == sendreq->req_events) {
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
MCA_PML_BFO_HDR_TYPE_FRAG,
status, btl);
}
return;
}
/* BFO FAILOVER CODE - end */
if(send_request_pml_complete_check(sendreq) == false) { if(send_request_pml_complete_check(sendreq) == false) {
mca_pml_bfo_send_request_schedule(sendreq); mca_pml_bfo_send_request_schedule(sendreq);
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if( OPAL_UNLIKELY(sendreq->req_error)) { MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl,
/* This situation can happen if the scheduling function MCA_PML_BFO_HDR_TYPE_FRAG,
* determined that a BTL was removed from underneath us "FRAG (BTL removal)");
* and therefore marked the request in error. In that #endif
* case, the scheduling of fragments can no longer proceed
* properly. Therefore, if no outstanding events, initiate
* the restart dance. */
opal_output_verbose(30, mca_pml_bfo_output,
"FRAG: completion: BTL has been removed, outstanding events=%d, "
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, (void *)sendreq,
sendreq->req_recv.pval,
status, sendreq->req_send.req_base.req_peer);
if (0 == sendreq->req_events) {
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
MCA_PML_BFO_HDR_TYPE_FRAG,
status, btl);
}
}
/* BFO FAILOVER CODE - end */
} }
/* check for pending requests */ /* check for pending requests */
#ifdef PML_BFO
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "FRAG"); MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "FRAG");
#endif
MCA_PML_BFO_PROGRESS_PENDING(bml_btl); MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
} }
@ -438,7 +433,9 @@ int mca_pml_bfo_send_request_start_buffered(
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq; hdr->hdr_rndv.hdr_src_req.pval = sendreq;
#ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV(buffered)"); MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV(buffered)");
#endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc); sendreq->req_send.req_base.req_proc);
@ -487,11 +484,11 @@ int mca_pml_bfo_send_request_start_buffered(
if( OPAL_LIKELY( 1 == rc ) ) { if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered); mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered);
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
sendreq->req_events++; sendreq->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
mca_bml_base_free(bml_btl, des ); mca_bml_base_free(bml_btl, des );
@ -537,14 +534,13 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq,
MCA_PML_BFO_HDR_TYPE_MATCH, MCA_PML_BFO_HDR_TYPE_MATCH,
&des); &des);
if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) { if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* Needed for failover */ /* Needed in case of failover */
if (NULL != des) { if (NULL != des) {
des->des_cbfunc = mca_pml_bfo_match_completion_free; des->des_cbfunc = mca_pml_bfo_match_completion_free;
des->des_cbdata = sendreq->req_endpoint; des->des_cbdata = sendreq->req_endpoint;
} }
/* BFO FAILOVER CODE - end */ #endif
/* signal request completion */ /* signal request completion */
send_request_pml_complete(sendreq); send_request_pml_complete(sendreq);
@ -774,9 +770,9 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq; hdr->hdr_rndv.hdr_src_req.pval = sendreq;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET"); MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET");
/* BFO FAILOVER CODE - end */ #endif
hdr->hdr_rget.hdr_des.pval = src; hdr->hdr_rget.hdr_des.pval = src;
hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt; hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt;
@ -826,9 +822,9 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq; hdr->hdr_rndv.hdr_src_req.pval = sendreq;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV"); MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV");
/* BFO FAILOVER CODE - end */ #endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc); sendreq->req_send.req_base.req_proc);
@ -852,12 +848,12 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_LIKELY( 1 == rc ) && (true == need_local_cb)) { if( OPAL_LIKELY( 1 == rc ) && (true == need_local_cb)) {
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, 0 ); mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, 0 );
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if ((des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) && if ((des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) &&
(MCA_PML_BFO_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type)) { (MCA_PML_BFO_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type)) {
sendreq->req_events++; sendreq->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
mca_bml_base_free(bml_btl, des); mca_bml_base_free(bml_btl, des);
@ -925,9 +921,9 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed; hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq; hdr->hdr_rndv.hdr_src_req.pval = sendreq;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV"); MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV");
/* BFO FAILOVER CODE - end */ #endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV, bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc); sendreq->req_send.req_base.req_proc);
@ -945,11 +941,11 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_LIKELY( 1 == rc ) ) { if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, size ); mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, size );
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
sendreq->req_events++; sendreq->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
mca_bml_base_free(bml_btl, des ); mca_bml_base_free(bml_btl, des );
@ -1062,7 +1058,7 @@ mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq)
mca_bml_base_btl_t* bml_btl; mca_bml_base_btl_t* bml_btl;
assert(range->range_send_length != 0); assert(range->range_send_length != 0);
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* Failover code. If this is true, this means the request thinks we /* Failover code. If this is true, this means the request thinks we
* have more BTLs than there really are. This can happen because * have more BTLs than there really are. This can happen because
* a BTL was removed from the available list. In this case, we * a BTL was removed from the available list. In this case, we
@ -1072,7 +1068,7 @@ mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq)
sendreq->req_error++; sendreq->req_error++;
return OMPI_ERROR; return OMPI_ERROR;
} }
/* BFO FAILOVER CODE - end */ #endif
if(prev_bytes_remaining == range->range_send_length) if(prev_bytes_remaining == range->range_send_length)
num_fail++; num_fail++;
@ -1181,11 +1177,11 @@ cannot_pack:
range = get_next_send_range(sendreq, range); range = get_next_send_range(sendreq, range);
prev_bytes_remaining = 0; prev_bytes_remaining = 0;
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
sendreq->req_events++; sendreq->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
} else { } else {
mca_bml_base_free(bml_btl,des); mca_bml_base_free(bml_btl,des);
} }
@ -1209,28 +1205,33 @@ static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata; mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata;
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)frag->rdma_req; mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)frag->rdma_req;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* BFO FAILOVER CODE - begin */
sendreq->req_events--;
/* BFO FAILOVER CODE - end */
/* check completion status */ /* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
sendreq->req_error++; sendreq->req_error++;
/* BFO FAILOVER CODE - end */ #else
/* TSW - FIX */
ORTE_ERROR_LOG(status);
orte_errmgr.abort(-1, NULL);
#endif
} }
#ifdef PML_BFO
/* BFO FAILOVER CODE - begin */ sendreq->req_events--;
MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_PUT_COMPLETION(sendreq, status, btl); MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl);
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RDMA write"); MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RDMA write");
/* BFO FAILOVER CODE - end */ #endif
mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl, bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_des, frag->rdma_hdr.hdr_rdma.hdr_des,
des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence, #ifdef PML_BFO
sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_base.req_comm->c_my_rank); sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank);
#else
des->order, 0);
#endif
/* check for request completion */ /* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length); OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
@ -1275,9 +1276,13 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag )
/* tell receiver to unregister memory */ /* tell receiver to unregister memory */
mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc, mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des, bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence, #ifdef PML_BFO
sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid, MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_send.req_base.req_comm->c_my_rank); sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank);
#else
MCA_BTL_NO_ORDER, 1);
#endif
/* send fragment by copy in/out */ /* send fragment by copy in/out */
mca_pml_bfo_send_request_copy_in_out(sendreq, mca_pml_bfo_send_request_copy_in_out(sendreq,
@ -1313,14 +1318,11 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag )
orte_errmgr.abort(-1, NULL); orte_errmgr.abort(-1, NULL);
} }
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
mca_pml_bfo_send_request_t *sendreq = ((mca_pml_bfo_send_request_t*)frag->rdma_req)->req_events++;
(mca_pml_bfo_send_request_t*)frag->rdma_req;
sendreq->req_events++;
} }
/* BFO FAILOVER CODE - end */ #endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -1341,18 +1343,20 @@ void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq,
size_t i, size = 0; size_t i, size = 0;
if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_TYPE_ACK) { if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_TYPE_ACK) {
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
/* Handle the failover case where a RNDV request may /* Handle the failover case where a RNDV request may
* have turned into a RGET and therefore the state * have turned into a RGET and therefore the state
* is not being tracked. */ * is not being tracked. */
if (sendreq->req_state != 0) { if (sendreq->req_state != 0) {
OPAL_THREAD_ADD32(&sendreq->req_state, -1); OPAL_THREAD_ADD32(&sendreq->req_state, -1);
} }
/* BFO FAILOVER CODE - end */ #else
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
#endif
} }
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
sendreq->req_recv = hdr->hdr_dst_req; /* only needed once, but it is OK */ sendreq->req_recv = hdr->hdr_dst_req; /* only needed once, but it is OK */
/* BFO FAILOVER CODE - end */ #endif
MCA_PML_BFO_RDMA_FRAG_ALLOC(frag, rc); MCA_PML_BFO_RDMA_FRAG_ALLOC(frag, rc);
@ -1380,21 +1384,10 @@ void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq,
} }
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl); frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
frag->rdma_btl = btl; frag->rdma_btl = btl;
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl);
opal_output(0, "[%s:%d] invalid bml for rdma put", __FILE__, __LINE__); #endif
MCA_PML_BFO_RDMA_FRAG_RETURN(frag);
sendreq->req_error++;
if (0 == sendreq->req_events) {
opal_output(0, "[%s:%d] Issuing rndvrestartnotify", __FILE__, __LINE__);
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
MCA_PML_BFO_HDR_TYPE_PUT,
OMPI_ERROR, btl);
}
return;
}
/* BFO FAILOVER CODE - end */
frag->rdma_hdr.hdr_rdma = *hdr; frag->rdma_hdr.hdr_rdma = *hdr;
frag->rdma_req = sendreq; frag->rdma_req = sendreq;
frag->rdma_ep = bml_endpoint; frag->rdma_ep = bml_endpoint;

Просмотреть файл

@ -42,12 +42,12 @@ struct mca_pml_bfo_send_request_t {
mca_pml_base_send_request_t req_send; mca_pml_base_send_request_t req_send;
mca_bml_base_endpoint_t* req_endpoint; mca_bml_base_endpoint_t* req_endpoint;
ompi_ptr_t req_recv; ompi_ptr_t req_recv;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
int32_t req_events; /* number of outstanding events on request */ int32_t req_events; /* number of outstanding events on request */
int32_t req_restartseq; /* sequence number of restarted request */ int32_t req_restartseq; /* sequence number of restarted request */
int32_t req_restart; /* state of restarted request */ int32_t req_restart; /* state of restarted request */
int32_t req_error; /* non-zero when error has occurred on request */ int32_t req_error; /* non-zero when error has occurred on request */
/* BFO FAILOVER CODE - end */ #endif
int32_t req_state; int32_t req_state;
int32_t req_lock; int32_t req_lock;
bool req_throttle_sends; bool req_throttle_sends;
@ -249,7 +249,7 @@ send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq)
MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true); MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
} }
sendreq->req_send.req_base.req_pml_complete = true; sendreq->req_send.req_base.req_pml_complete = true;
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
assert(0 == sendreq->req_events); assert(0 == sendreq->req_events);
sendreq->req_restartseq = 0; sendreq->req_restartseq = 0;
/* Since sequence numbers increase monotonically and /* Since sequence numbers increase monotonically and
@ -258,7 +258,7 @@ send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq)
* as that is not within the valid range. */ * as that is not within the valid range. */
sendreq->req_send.req_base.req_sequence = sendreq->req_send.req_base.req_sequence =
sendreq->req_send.req_base.req_sequence - 10; sendreq->req_send.req_base.req_sequence - 10;
/* BFO FAILOVER CODE - end */ #endif
if(sendreq->req_send.req_base.req_free_called) { if(sendreq->req_send.req_base.req_free_called) {
MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq); MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq);
@ -437,12 +437,12 @@ mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq )
sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE; sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32( sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1); &comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
/* BFO FAILOVER CODE - begin */ #ifdef PML_BFO
sendreq->req_restartseq = 0; /* counts up restarts */ sendreq->req_restartseq = 0; /* counts up restarts */
sendreq->req_restart = 0; /* reset in case we restart again */ sendreq->req_restart = 0; /* reset in case we restart again */
sendreq->req_error = 0; /* clear error state */ sendreq->req_error = 0; /* clear error state */
sendreq->req_events = 0; /* clear events, probably 0 anyways */ sendreq->req_events = 0; /* clear events, probably 0 anyways */
/* BFO FAILOVER CODE - end */ #endif
MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base ); MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );