
Some more formatting fixes and code refactoring. All these changes are in the bfo so this has no effect on ob1.

This commit was SVN r23815.
Rolf vandeVaart 2010-09-29 13:46:45 +00:00
parent 19f9fc1146
commit 59e3fa8ed3
7 changed files: 210 additions and 164 deletions
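At a high level, the diffs below (1) pull the four failover callback registrations out of mca_pml_bfo_add_procs() into a new mca_pml_bfo_register_callbacks() helper, (2) replace the open-coded request-in-error checks in the ACK/FRAG/PUT/FIN/RDMA-read paths with MCA_PML_BFO_ERROR_CHECK_* macros, and (3) carry the FIN restart sequence in the embedded match header's otherwise unused hdr_flags field rather than in a dedicated hdr_restartseq field. The following is only a minimal, self-contained sketch of the registration-helper pattern, not the actual Open MPI code: callback_fn, register_cb(), and the numeric tags are hypothetical stand-ins for the BTL receive-callback signature, mca_bml.bml_register(), and the MCA_PML_BFO_HDR_TYPE_* constants.

/* Minimal sketch of the registration-helper pattern; all names here are
 * hypothetical stand-ins, not the real Open MPI types or entry points. */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_SUCCESS 0

/* Stand-ins for the per-tag callback signature and for mca_bml.bml_register(). */
typedef int (*callback_fn)(void *cbdata);

static int register_cb(int tag, callback_fn cb, void *cbdata)
{
    (void)cb; (void)cbdata;
    printf("registered handler for tag %d\n", tag);
    return SKETCH_SUCCESS;
}

/* Stubs standing in for the four failover fragment handlers. */
static int on_rndvrestartnotify(void *cbdata) { (void)cbdata; return SKETCH_SUCCESS; }
static int on_rndvrestartack(void *cbdata)    { (void)cbdata; return SKETCH_SUCCESS; }
static int on_rndvrestartnack(void *cbdata)   { (void)cbdata; return SKETCH_SUCCESS; }
static int on_recverrnotify(void *cbdata)     { (void)cbdata; return SKETCH_SUCCESS; }

/* Counterpart of mca_pml_bfo_register_callbacks(): register every failover
 * tag and return the first error so the caller needs only one rc check.
 * (The real helper makes four explicit calls; a table keeps the sketch short.) */
static int register_failover_callbacks(void)
{
    static const struct { int tag; callback_fn cb; } entries[] = {
        { 11, on_rndvrestartnotify },  /* e.g. RNDVRESTARTNOTIFY */
        { 12, on_rndvrestartack },     /* e.g. RNDVRESTARTACK    */
        { 13, on_rndvrestartnack },    /* e.g. RNDVRESTARTNACK   */
        { 14, on_recverrnotify },      /* e.g. RECVERRNOTIFY     */
    };
    for (size_t i = 0; i < sizeof(entries) / sizeof(entries[0]); ++i) {
        int rc = register_cb(entries[i].tag, entries[i].cb, NULL);
        if (SKETCH_SUCCESS != rc) {
            return rc;
        }
    }
    return SKETCH_SUCCESS;
}

int main(void)
{
    /* In add_procs() the four inline registrations collapse to one call. */
    return (SKETCH_SUCCESS == register_failover_callbacks()) ? 0 : 1;
}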

View file

@@ -411,33 +411,11 @@ int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
goto cleanup_and_return;
/* BFO FAILOVER CODE - begin */
/* The following four functions are utilized when failover
* support for openib is enabled. */
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
mca_pml_bfo_recv_frag_callback_rndvrestartnotify,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK,
mca_pml_bfo_recv_frag_callback_rndvrestartack,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK,
mca_pml_bfo_recv_frag_callback_rndvrestartnack,
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
mca_pml_bfo_recv_frag_callback_recverrnotify,
NULL );
rc = mca_pml_bfo_register_callbacks();
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
/* BFO FAILOVER CODE - end */
/* register error handlers */
rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler);
if(OMPI_SUCCESS != rc)
@@ -529,18 +507,21 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
return OMPI_ERR_OUT_OF_RESOURCE;
}
fin->des_cbfunc = mca_pml_bfo_fin_completion;
fin->des_cbdata = proc;
fin->des_cbdata = NULL;
/* fill in header */
hdr = (mca_pml_bfo_fin_hdr_t*)fin->des_src->seg_addr.pval;
hdr->hdr_match.hdr_common.hdr_flags = 0;
hdr->hdr_match.hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
hdr->hdr_common.hdr_flags = 0;
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des;
hdr->hdr_fail = status;
/* BFO FAILOVER CODE - begin */
fin->des_cbdata = proc;
hdr->hdr_match.hdr_seq = seq;
hdr->hdr_restartseq = restartseq;
hdr->hdr_match.hdr_ctx = ctx;
hdr->hdr_match.hdr_src = src;
hdr->hdr_match.hdr_common.hdr_flags = restartseq; /* use unused hdr_flags field */
/* BFO FAILOVER CODE - end */
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc);
@@ -608,10 +589,10 @@ void mca_pml_bfo_process_pending_packets(struct mca_btl_base_module_t* btl)
pckt->hdr.hdr_fin.hdr_des,
pckt->order,
pckt->hdr.hdr_fin.hdr_fail,
pckt->hdr.hdr_match.hdr_seq,
pckt->hdr.hdr_fin.hdr_restartseq,
pckt->hdr.hdr_match.hdr_ctx,
pckt->hdr.hdr_match.hdr_src);
pckt->hdr.hdr_fin.hdr_match.hdr_seq,
pckt->hdr.hdr_fin.hdr_match.hdr_common.hdr_flags,
pckt->hdr.hdr_fin.hdr_match.hdr_ctx,
pckt->hdr.hdr_fin.hdr_match.hdr_src);
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
return;
}

View file

@@ -148,10 +148,12 @@ bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descripto
* if it was freed and not reused yet. */
if (NULL == rdma->des_cbdata) {
opal_output_verbose(20, mca_pml_bfo_output,
"FIN: received: dropping because not pointing to valid descriptor "
"PML=%d CTX=%d SRC=%d RQS=%d",
hdr->hdr_match.hdr_seq, hdr->hdr_match.hdr_ctx,
hdr->hdr_match.hdr_src, hdr->hdr_fin.hdr_restartseq);
"FIN: received: dropping because not pointing to valid descriptor "
"PML=%d CTX=%d SRC=%d RQS=%d",
hdr->hdr_fin.hdr_match.hdr_seq,
hdr->hdr_fin.hdr_match.hdr_ctx,
hdr->hdr_fin.hdr_match.hdr_src,
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags);
return true;
}
@@ -165,65 +167,76 @@ bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descripto
* what fields to access. */
if (basereq->req_type == MCA_PML_REQUEST_RECV) {
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)basereq;
if ((hdr->hdr_match.hdr_ctx != recvreq->req_recv.req_base.req_comm->c_contextid) ||
(hdr->hdr_match.hdr_src != recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) ||
(hdr->hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
if ((hdr->hdr_fin.hdr_match.hdr_ctx !=
recvreq->req_recv.req_base.req_comm->c_contextid) ||
(hdr->hdr_fin.hdr_match.hdr_src !=
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE) ||
(hdr->hdr_fin.hdr_match.hdr_seq != (uint16_t)recvreq->req_msgseq)) {
opal_output_verbose(5, mca_pml_bfo_output,
"FIN: received on receiver: dropping because no match "
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
"RQS:exp=%d,act=%d, dst_req=%p",
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
(uint16_t)recvreq->req_msgseq, hdr->hdr_fin.hdr_match.hdr_seq,
recvreq->req_recv.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx,
hdr->hdr_fin.hdr_match.hdr_ctx,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_match.hdr_src,
recvreq->req_restartseq, hdr->hdr_fin.hdr_restartseq,
hdr->hdr_fin.hdr_match.hdr_src,
recvreq->req_restartseq,
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
(void *)recvreq);
return true;
}
if (hdr->hdr_fin.hdr_restartseq != recvreq->req_restartseq) {
if (hdr->hdr_fin.hdr_match.hdr_common.hdr_flags != recvreq->req_restartseq) {
opal_output_verbose(5, mca_pml_bfo_output,
"FIN: received on receiver: dropping because old "
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
"RQS:exp=%d,act=%d, dst_req=%p",
(uint16_t)recvreq->req_msgseq, hdr->hdr_match.hdr_seq,
(uint16_t)recvreq->req_msgseq, hdr->hdr_fin.hdr_match.hdr_seq,
recvreq->req_recv.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx,
hdr->hdr_fin.hdr_match.hdr_ctx,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
hdr->hdr_match.hdr_src,
recvreq->req_restartseq, hdr->hdr_fin.hdr_restartseq,
hdr->hdr_fin.hdr_match.hdr_src,
recvreq->req_restartseq,
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
(void *)recvreq);
return true;
}
} else if (basereq->req_type == MCA_PML_REQUEST_SEND) {
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)basereq;
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_peer) ||
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) {
if ((hdr->hdr_fin.hdr_match.hdr_ctx !=
sendreq->req_send.req_base.req_comm->c_contextid) ||
(hdr->hdr_fin.hdr_match.hdr_src !=
sendreq->req_send.req_base.req_peer) ||
(hdr->hdr_fin.hdr_match.hdr_seq !=
(uint16_t)sendreq->req_send.req_base.req_sequence)) {
uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
opal_output_verbose(5, mca_pml_bfo_output,
"FIN: received on sender: dropping because no match "
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
"RQS:exp=%d,act=%d, dst_req=%p",
seq, hdr->hdr_match.hdr_seq,
seq, hdr->hdr_fin.hdr_match.hdr_seq,
sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx,
sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq,
hdr->hdr_fin.hdr_match.hdr_ctx,
sendreq->req_send.req_base.req_peer,
hdr->hdr_fin.hdr_match.hdr_src,
sendreq->req_restartseq,
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
(void *)sendreq);
return true;
}
if (hdr->hdr_fin.hdr_restartseq != sendreq->req_restartseq) {
if (hdr->hdr_fin.hdr_match.hdr_common.hdr_flags != sendreq->req_restartseq) {
uint16_t seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
opal_output_verbose(5, mca_pml_bfo_output,
"FIN: received on sender: dropping because old "
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
"RQS:exp=%d,act=%d, dst_req=%p",
seq, hdr->hdr_match.hdr_seq,
seq, hdr->hdr_fin.hdr_match.hdr_seq,
sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx,
sendreq->req_send.req_base.req_peer, hdr->hdr_match.hdr_src,
sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq,
hdr->hdr_fin.hdr_match.hdr_ctx,
sendreq->req_send.req_base.req_peer,
hdr->hdr_fin.hdr_match.hdr_src,
sendreq->req_restartseq,
hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
(void *)sendreq);
return true;
}
@@ -236,8 +249,9 @@ bool mca_pml_bfo_is_duplicate_fin(mca_pml_bfo_hdr_t* hdr, mca_btl_base_descripto
opal_output_verbose(5, mca_pml_bfo_output,
"FIN: received: dropping because descriptor has been reused "
"PML=%d CTX=%d SRC=%d RQS=%d rdma->des_flags=%d",
hdr->hdr_match.hdr_seq, hdr->hdr_match.hdr_ctx,
hdr->hdr_match.hdr_src, hdr->hdr_fin.hdr_restartseq, rdma->des_flags);
hdr->hdr_fin.hdr_match.hdr_seq, hdr->hdr_fin.hdr_match.hdr_ctx,
hdr->hdr_fin.hdr_match.hdr_src, hdr->hdr_fin.hdr_match.hdr_common.hdr_flags,
rdma->des_flags);
return true;
}
}
@@ -281,7 +295,8 @@ void mca_pml_bfo_repost_fin(struct mca_btl_base_descriptor_t* des) {
/* Reconstruct the fin for sending on the other BTL */
mca_pml_bfo_send_fin(proc, bml_btl,
hdr->hdr_des, MCA_BTL_NO_ORDER,
hdr->hdr_fail, hdr->hdr_match.hdr_seq, hdr->hdr_restartseq,
hdr->hdr_fail, hdr->hdr_match.hdr_seq,
hdr->hdr_match.hdr_common.hdr_flags,
hdr->hdr_match.hdr_ctx, hdr->hdr_match.hdr_src);
return;
}
@@ -1886,3 +1901,38 @@ void mca_pml_bfo_check_recv_ctl_completion_status(mca_btl_base_module_t* btl,
break;
}
}
/**
* Register four functions to handle extra PML message types that
* are utilized when a failover occurs.
*/
int mca_pml_bfo_register_callbacks(void) {
int rc;
/* The following four functions are utilized when failover
* support for openib is enabled. */
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNOTIFY,
mca_pml_bfo_recv_frag_callback_rndvrestartnotify,
NULL );
if(OMPI_SUCCESS != rc)
return rc;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTACK,
mca_pml_bfo_recv_frag_callback_rndvrestartack,
NULL );
if(OMPI_SUCCESS != rc)
return rc;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDVRESTARTNACK,
mca_pml_bfo_recv_frag_callback_rndvrestartnack,
NULL );
if(OMPI_SUCCESS != rc)
return rc;
rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RECVERRNOTIFY,
mca_pml_bfo_recv_frag_callback_recverrnotify,
NULL );
if(OMPI_SUCCESS != rc)
return rc;
return rc;
}

View file

@@ -74,7 +74,7 @@ extern void mca_pml_bfo_map_out( mca_btl_base_module_t *btl,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
int mca_pml_bfo_register_callbacks(void);
/**
@@ -99,7 +99,98 @@ extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* descriptor,
void* cbdata );
/**
* A bunch of macros to help isolate failover code from regular ob1 code.
*/
/* Drop any ACK fragments if request is in error state. Do not want
* to initiate any more activity. */
#define MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq) \
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"ACK: received: dropping because request in error, " \
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(sendreq)->req_restartseq, \
(void *)(sendreq), (sendreq)->req_recv.pval, \
(sendreq)->req_send.req_base.req_peer); \
return; \
}
/* Drop any FRAG fragments if request is in error state. Do not want
* to initiate any more activity. */
#define MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq) \
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"FRAG: received: dropping because request in error, " \
"PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d", \
(uint16_t)(recvreq)->req_msgseq, \
(recvreq)->remote_req_send.pval, \
(void *)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE, \
(int)hdr->hdr_frag.hdr_frag_offset); \
return; \
}
/* Drop any PUT fragments if request is in error state. Do not want
* to initiate any more activity. */
#define MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq) \
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"PUT: received: dropping because request in error, " \
"PML=%d, src_req=%p, dst_req=%p, peer=%d", \
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(void *)(sendreq), (sendreq)->req_recv.pval, \
(sendreq)->req_send.req_base.req_peer); \
return; \
}
/**
* Macros for pml_bfo_recvreq.c file.
*/
/* This can happen if a FIN message arrives after the request was
* marked in error. So, just drop the message. Note that the status
* field is not being checked. That is because the status field is the
* value returned in the FIN hdr.hdr_fail field and may be used for
* other things. Note that we allow the various fields to be updated
* in case this actually completes the request and the sending side
* thinks it is done. */
#define MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq) \
if( OPAL_UNLIKELY((recvreq)->req_errstate)) { \
opal_output_verbose(20, mca_pml_bfo_output, \
"FIN: received on broken request, skipping, " \
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(recvreq)->remote_req_send.pval, (void *)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
/* Even though in error, it still might complete. */ \
recv_request_pml_complete_check(recvreq); \
return; \
}
#define MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq) \
if ((recvreq)->req_errstate) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion failed, error already seen, " \
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(unsigned long)(recvreq)->remote_req_send.pval, \
(unsigned long)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
return; \
} else { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA read: completion failed, sending RECVERRNOTIFY to " \
"sender, PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d", \
(recvreq)->req_msgseq, (recvreq)->req_restartseq, \
(unsigned long)(recvreq)->remote_req_send.pval, \
(unsigned long)(recvreq), \
(recvreq)->req_recv.req_base.req_ompi.req_status.MPI_SOURCE); \
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status); \
}
END_C_DECLS

View file

@@ -340,24 +340,17 @@ do { \
*/
struct mca_pml_bfo_fin_hdr_t {
mca_pml_bfo_common_hdr_t hdr_common; /**< common attributes */
/* BFO FAILOVER CODE - begin */
mca_pml_bfo_match_hdr_t hdr_match; /**< match info - needed for failover */
uint8_t hdr_restartseq; /**< restart sequence - failover use only */
/* BFO FAILOVER CODE - end */
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
uint8_t hdr_padding[7];
#endif
uint32_t hdr_fail; /**< RDMA operation failed */
ompi_ptr_t hdr_des; /**< completed descriptor */
};
typedef struct mca_pml_bfo_fin_hdr_t mca_pml_bfo_fin_hdr_t;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG
#define MCA_PML_BFO_FIN_HDR_FILL(h) \
do { \
(h).hdr_padding[0] = 0; \
(h).hdr_padding[1] = 0; \
} while (0)
#define MCA_PML_BFO_FIN_HDR_FILL(h)
#else
#define MCA_PML_BFO_FIN_HDR_FILL(h)
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT && OPAL_ENABLE_DEBUG */
@@ -365,12 +358,13 @@ do { \
#define MCA_PML_BFO_FIN_HDR_NTOH(h) \
do { \
MCA_PML_BFO_COMMON_HDR_NTOH((h).hdr_common); \
MCA_PML_BFO_MATCH_HDR_NTOH((h).hdr_match); \
} while (0)
#define MCA_PML_BFO_FIN_HDR_HTON(h) \
do { \
MCA_PML_BFO_COMMON_HDR_HTON((h).hdr_common); \
MCA_PML_BFO_FIN_HDR_FILL(h); \
MCA_PML_BFO_MATCH_HDR_HTON((h).hdr_match); \
} while (0)
/* BFO FAILOVER CODE - begin */

View file

@@ -307,18 +307,7 @@ void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
/* BFO FAILOVER CODE - begin */
/* Drop any fragments if request is in error state. Do not want
* to initiate any more activity. */
if( OPAL_UNLIKELY(sendreq->req_error)) {
opal_output_verbose(20, mca_pml_bfo_output,
"ACK: received: dropping because request in error, "
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d",
(uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq,
(void *)sendreq, sendreq->req_recv.pval,
sendreq->req_send.req_base.req_peer);
return;
}
MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq)
/* BFO FAILOVER CODE - end */
/* if the request should be delivered entirely by copy in/out
@@ -357,33 +346,21 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segments = des->des_dst;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_recv_request_t* recvreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG);
recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
mca_btl_base_segment_t* segments = des->des_dst;
mca_pml_bfo_hdr_t* hdr = (mca_pml_bfo_hdr_t*)segments->seg_addr.pval;
mca_pml_bfo_recv_request_t* recvreq;
if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_bfo_common_hdr_t)) ) {
return;
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG);
recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
/* BFO FAILOVER CODE - begin */
/* Drop any fragments if request is in error state. Do not want
* to initiate any more activity. */
if( OPAL_UNLIKELY(recvreq->req_errstate)) {
opal_output_verbose(20, mca_pml_bfo_output,
"FRAG: received: dropping because request in error, "
"PML=%d, src_req=%p, dst_req=%p, peer=%d, offset=%d",
(uint16_t)recvreq->req_msgseq,
recvreq->remote_req_send.pval,
(void *)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE,
(int)hdr->hdr_frag.hdr_frag_offset);
return;
}
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq)
/* BFO FAILOVER CODE - end */
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
return;
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
return;
}
@@ -402,17 +379,7 @@ void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT);
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
/* BFO FAILOVER CODE - begin */
/* Drop any fragments if request is in error state. Do not want
* to initiate any more activity. */
if( OPAL_UNLIKELY(sendreq->req_error)) {
opal_output_verbose(20, mca_pml_bfo_output,
"PUT: received: dropping because request in error, "
"PML=%d, src_req=%p, dst_req=%p, peer=%d",
(uint16_t)sendreq->req_send.req_base.req_sequence,
(void *)sendreq, sendreq->req_recv.pval,
sendreq->req_send.req_base.req_peer);
return;
}
MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq)
/* BFO FAILOVER CODE - end */
mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma);

View file

@@ -196,26 +196,9 @@ static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl,
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
btl->btl_free(btl, des);
/* BFO FAILOVER CODE - begin */
/* This can happen if a FIN message arrives after the request was
* marked in error. So, just drop the message. Note that the
* status field is not being checked. That is because the status
* field is the value returned in the FIN hdr.hdr_fail field and
* may be used for other things. Note that we allow the various
* fields to be updated in case this actually completes the
* request and the sending side thinks it is done. */
if( OPAL_UNLIKELY(recvreq->req_errstate)) {
opal_output_verbose(20, mca_pml_bfo_output,
"FIN: received on broken request, skipping, "
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d",
recvreq->req_msgseq, recvreq->req_restartseq,
(unsigned long)recvreq->remote_req_send.pval,
(unsigned long)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
/* Even though in error, it still might complete. */
recv_request_pml_complete_check(recvreq);
return;
}
MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq)
/* BFO FAILOVER CODE - end */
/* check completion status */
@@ -368,27 +351,7 @@ static void mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* BFO FAILOVER CODE - begin */
/* Record the error and send RECVERRNOTIFY if necessary. */
if (recvreq->req_errstate) {
opal_output_verbose(30, mca_pml_bfo_output,
"RDMA read: completion failed, error already seen, "
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d",
recvreq->req_msgseq, recvreq->req_restartseq,
(unsigned long)recvreq->remote_req_send.pval,
(unsigned long)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
return;
} else {
opal_output_verbose(30, mca_pml_bfo_output,
"RDMA read: completion failed, sending RECVERRNOTIFY to sender, "
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, peer=%d",
recvreq->req_msgseq, recvreq->req_restartseq,
(unsigned long)recvreq->remote_req_send.pval,
(unsigned long)recvreq,
recvreq->req_recv.req_base.req_ompi.req_status.MPI_SOURCE);
mca_pml_bfo_recv_request_recverrnotify(recvreq, MCA_PML_BFO_HDR_TYPE_RGET, status);
}
MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq)
}
/* BFO FAILOVER CODE - end */
/* BFO FAILOVER CODE - begin */

View file

@@ -379,7 +379,7 @@ mca_pml_bfo_send_ctl_completion( mca_btl_base_module_t* btl,
hdr->hdr_match.hdr_ctx,
sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_match.hdr_src,
sendreq->req_restartseq, hdr->hdr_fin.hdr_restartseq,
sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq,
(void *)sendreq);
return;
}