1
1

Some more refactoring in the BFO PML. Getting it

as close to OB1 PML as possible.

This commit was SVN r23920.
Этот коммит содержится в:
Rolf vandeVaart 2010-10-22 18:13:35 +00:00
родитель 1766bf271a
Коммит 148ed00dd1
10 изменённых файлов: 383 добавлений и 310 удалений

Просмотреть файл

@ -48,9 +48,9 @@
#include "pml_bfo_sendreq.h"
#include "pml_bfo_recvreq.h"
#include "pml_bfo_rdmafrag.h"
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
#include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */
#endif
mca_pml_bfo_t mca_pml_bfo = {
{
@ -409,13 +409,12 @@ int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
NULL );
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
rc = mca_pml_bfo_register_callbacks();
if(OMPI_SUCCESS != rc)
goto cleanup_and_return;
/* BFO FAILOVER CODE - end */
#endif
/* register error handlers */
rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler);
if(OMPI_SUCCESS != rc)
@ -472,14 +471,13 @@ static void mca_pml_bfo_fin_completion( mca_btl_base_module_t* btl,
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
mca_pml_bfo_repost_fin(des);
return;
}
MCA_PML_BFO_CHECK_EAGER_BML_BTL_ON_FIN_COMPLETION(bml_btl, btl, des);
/* BFO FAILOVER CODE - end */
#endif
/* check for pending requests */
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
@ -494,12 +492,14 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
mca_bml_base_btl_t* bml_btl,
ompi_ptr_t hdr_des,
uint8_t order,
#ifdef PML_BFO
uint32_t status,
/* BFO FAILOVER CODE - begin */
uint16_t seq,
uint8_t restartseq,
uint16_t ctx, uint32_t src)
/* BFO FAILOVER CODE - end */
#else
uint32_t status )
#endif
{
mca_btl_base_descriptor_t* fin;
mca_pml_bfo_fin_hdr_t* hdr;
@ -521,13 +521,13 @@ int mca_pml_bfo_send_fin( ompi_proc_t* proc,
hdr->hdr_common.hdr_type = MCA_PML_BFO_HDR_TYPE_FIN;
hdr->hdr_des = hdr_des;
hdr->hdr_fail = status;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
fin->des_cbdata = proc;
hdr->hdr_match.hdr_seq = seq;
hdr->hdr_match.hdr_ctx = ctx;
hdr->hdr_match.hdr_src = src;
hdr->hdr_match.hdr_common.hdr_flags = restartseq; /* use unused hdr_flags field */
/* BFO FAILOVER CODE - end */
#endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_FIN, proc);
@ -594,11 +594,15 @@ void mca_pml_bfo_process_pending_packets(mca_bml_base_btl_t* bml_btl)
rc = mca_pml_bfo_send_fin(pckt->proc, send_dst,
pckt->hdr.hdr_fin.hdr_des,
pckt->order,
#ifdef PML_BFO
pckt->hdr.hdr_fin.hdr_fail,
pckt->hdr.hdr_fin.hdr_match.hdr_seq,
pckt->hdr.hdr_fin.hdr_match.hdr_common.hdr_flags,
pckt->hdr.hdr_fin.hdr_match.hdr_ctx,
pckt->hdr.hdr_fin.hdr_match.hdr_src);
#else
pckt->hdr.hdr_fin.hdr_fail);
#endif
if( OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == OPAL_SOS_GET_ERROR_CODE(rc)) ) {
return;
}
@ -640,14 +644,13 @@ void mca_pml_bfo_process_pending_rdma(void)
void mca_pml_bfo_error_handler(
struct mca_btl_base_module_t* btl, int32_t flags,
ompi_proc_t* errproc, char* btlinfo ) {
/* BFO FAILOVER CODE - begin */
/* If we get a non-fatal error, try to failover */
#ifdef PML_BFO
if (flags & MCA_BTL_ERROR_FLAGS_NONFATAL) {
mca_pml_bfo_failover_error_handler(btl, flags, errproc, btlinfo);
/* BFO FAILOVER CODE - end */
} else {
orte_errmgr.abort(-1, NULL);
return;
}
#endif
orte_errmgr.abort(-1, NULL);
}
#if OPAL_ENABLE_FT_CR == 0

Просмотреть файл

@ -23,6 +23,7 @@
#ifndef MCA_PML_BFO_H
#define MCA_PML_BFO_H
#define PML_BFO 1
#include "ompi_config.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/request/request.h"
@ -225,8 +226,12 @@ do { \
int mca_pml_bfo_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
#ifdef PML_BFO
ompi_ptr_t hdr_des, uint8_t order, uint32_t status,
uint16_t seq, uint8_t reqseq, uint16_t ctx, uint32_t src);
#else
ompi_ptr_t hdr_des, uint8_t order, uint32_t status);
#endif
/* This function tries to resend FIN/ACK packets from pckt_pending queue.
* Packets are added to the queue when sending of FIN or ACK is failed due to

Просмотреть файл

@ -2103,3 +2103,102 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
}
}
}
/**
* The completion event for the RNDV message has returned with an
* error. We know that the send request we are looking at is valid
* because it cannot be completed until the sendreq->req_state value
* reaches 0. And for the sendreq->req_state to reach 0, the
* completion event on the RNDV message must occur. So, we do not
* bother checking whether the send request is valid, because we know
* it is, but we put a few asserts in for good measure. We then check
* a few fields in the request to decide what to do. If the
* sendreq->req_error is set, that means that something has happend
* already to the request and we do not want to restart it.
* Presumably, we may have received a RECVERRNOTIFY message from the
* receiver. We also check the sendreq->req_acked field to see if it
* has been acked. If it has, then again we do not restart everything
* because obviously the RNDV message has made it to the other side.
*/
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
mca_pml_bfo_send_request_t* sendreq)
{
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_ctx ==
(sendreq)->req_send.req_base.req_comm->c_contextid);
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_src ==
(sendreq)->req_send.req_base.req_comm->c_my_rank);
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_seq ==
(uint16_t)(sendreq)->req_send.req_base.req_sequence);
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) {
(sendreq)->req_events--;
/* Assume RNDV did not make it, so restart from the beginning. */
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV);
return true;
}
return false;
}
/**
* Check to see if an error has occurred on this send request. If it has
* and there are no outstanding events, then we can start the restart dance.
*/
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq,
int status,
mca_btl_base_module_t* btl,
int type,
char *description)
{
opal_output_verbose(30, mca_pml_bfo_output,
"%s: completion: sendreq has error, outstanding events=%d, "
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
description,
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, (void *)sendreq,
sendreq->req_recv.pval,
status, sendreq->req_send.req_base.req_peer);
if (0 == sendreq->req_events) {
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
type, status, btl);
}
}
/* If we get an error on the RGET message, then first make sure that
* header matches the send request that we are pointing to. This is
* necessary, because even though the sending side got an error, the
* RGET may have made it to the receiving side and the message transfer
* may have completed. This would then mean the send request has been
* completed and perhaps in use by another communication. So there is
* no need to restart this request. Therefore, ensure that we are
* looking at the same request that the header thinks we are looking
* at. If not, then there is nothing else to be done. */
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des)
{
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval;
switch (hdr->hdr_common.hdr_type) {
case MCA_PML_BFO_HDR_TYPE_RGET:
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) ||
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) ||
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) {
opal_output_verbose(30, mca_pml_bfo_output,
"RGET: completion event: dropping because no valid request "
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d "
"RQS:exp=%d,act=%d, dst_req=%p",
(uint16_t)sendreq->req_send.req_base.req_sequence,
hdr->hdr_match.hdr_seq,
sendreq->req_send.req_base.req_comm->c_contextid,
hdr->hdr_match.hdr_ctx,
sendreq->req_send.req_base.req_comm->c_my_rank,
hdr->hdr_match.hdr_src,
sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq,
(void *)sendreq);
return;
}
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET);
return;
default:
opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)",
__FILE__, __LINE__, hdr->hdr_common.hdr_type);
orte_errmgr.abort(-1, NULL);
}
}

Просмотреть файл

@ -105,6 +105,16 @@ void mca_pml_bfo_find_recvreq_rdma_bml_btl(mca_bml_base_btl_t** bml_btl,
mca_pml_bfo_recv_request_t* recvreq,
char* type);
bool mca_pml_bfo_rndv_completion_status_error(struct mca_btl_base_descriptor_t* des,
mca_pml_bfo_send_request_t* sendreq);
void mca_pml_bfo_send_ctl_completion_status_error(struct mca_btl_base_descriptor_t* des);
void mca_pml_bfo_completion_sendreq_has_error(mca_pml_bfo_send_request_t* sendreq,
int status,
mca_btl_base_module_t* btl,
int type,
char *description);
/**
* Four new callbacks for the four new message types.
*/
@ -243,57 +253,33 @@ extern void mca_pml_bfo_recv_frag_callback_recverrnotify( mca_btl_base_module_t
* Macros for pml_bfo_sendreq.c file.
*/
/* The completion event for the RNDV message has returned with an
* error. We know that the send request we are looking at is valid
* because it cannot be completed until the sendreq->req_state value
* reaches 0. And for the sendreq->req_state to reach 0, the
* completion event on the RNDV message must occur. So, we do not
* bother checking whether the send request is valid, because we know
* it is, but we put a few asserts in for good measure. We then check
* a few fields in the request to decide what to do. If the
* sendreq->req_error is set, that means that something has happend
* already to the request and we do not want to restart it.
* Presumably, we may have received a RECVERRNOTIFY message from the
* receiver. We also check the sendreq->req_acked field to see if it
* has been acked. If it has, then again we do not restart everything
* because obviously the RNDV message has made it to the other side. */
#define MCA_PML_BFO_ERROR_ON_RNDV_COMPLETION(sendreq, des) \
do { \
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_ctx == \
(sendreq)->req_send.req_base.req_comm->c_contextid); \
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_src == \
(sendreq)->req_send.req_base.req_comm->c_my_rank); \
assert(((mca_pml_bfo_hdr_t*)((des)->des_src->seg_addr.pval))->hdr_match.hdr_seq == \
(uint16_t)(sendreq)->req_send.req_base.req_sequence); \
if ((!(sendreq)->req_error) && (NULL == (sendreq)->req_recv.pval)) { \
(sendreq)->req_events--; \
/* Assume RNDV did not make it, so restart from the beginning. */ \
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RNDV); \
return; \
} \
} while (0)
/* Now check the error state. This request can be in error if the
* RNDV message made it over, but the receiver got an error trying to
* send the ACK back and therefore sent a RECVERRNOTIFY message. In
* that case, we want to start the restart dance as the receiver has
* matched this message already. Only restart if there are no
* outstanding events on send request. */
#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RNDV_COMPLETION(sendreq, status, btl) \
if ((sendreq)->req_error) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RNDV: completion: sendreq has error, outstanding events=%d, " \
"PML=%d, RQS=%d, src_req=%lx, dst_req=%lx, status=%d, peer=%d", \
(sendreq)->req_events, \
(uint16_t)(sendreq)->req_send.req_base.req_sequence, \
(sendreq)->req_restartseq, (unsigned long)(sendreq), \
(unsigned long)(sendreq)->req_recv.pval, \
status, (sendreq)->req_send.req_base.req_peer); \
if (0 == (sendreq)->req_events) { \
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
MCA_PML_BFO_HDR_TYPE_RNDV, \
status, btl); \
} \
#define MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
if( OPAL_UNLIKELY ((sendreq)->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
btl, type, description); \
return; \
}
/**
* This macro is called within the frag completion function in two
* places. It is called to see if any errors occur prior to the
* completion event on the frag. It is then called a second time
* after the scheduling routine is called as the scheduling routine
* may have detected that a BTL that was cached on the request had
* been removed and therefore marked the request in error. In that
* case, the scheduling of fragments can no longer proceed properly,
* and if there are no outstanding events, iniated the restart dance.
*/
#define MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl, type, description) \
if( OPAL_UNLIKELY((sendreq)->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, \
btl, type, description); \
return; \
}
@ -302,7 +288,7 @@ do {
* field is not checked here. That is because that is the value
* returned in the FIN hdr.hdr_fail field and may be used for other
* things. */
#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RGET_COMPLETION(sendreq, btl, des) \
#define MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des) \
if( OPAL_UNLIKELY(sendreq->req_error)) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"FIN: received on broken request, skipping, " \
@ -314,66 +300,15 @@ do {
return; \
}
/* If we get an error on the RGET message, then first make sure that
* header matches the send request that we are pointing to. This is
* necessary, because even though the sending side got an error, the
* RGET may have made it to the receiving side and the message transfer
* may have completed. This would then mean the send request has been
* completed and perhaps in use by another communication. So there is
* no need to restart this request. Therefore, ensure that we are
* looking at the same request that the header thinks we are looking
* at. If not, then there is nothing else to be done. */
#define MCA_PML_BFO_ERROR_ON_SEND_CTL_COMPLETION(sendreq, des) \
do { \
mca_pml_bfo_hdr_t* hdr = des->des_src->seg_addr.pval; \
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata; \
switch (hdr->hdr_common.hdr_type) { \
case MCA_PML_BFO_HDR_TYPE_RGET: \
if ((hdr->hdr_match.hdr_ctx != sendreq->req_send.req_base.req_comm->c_contextid) || \
(hdr->hdr_match.hdr_src != sendreq->req_send.req_base.req_comm->c_my_rank) || \
(hdr->hdr_match.hdr_seq != (uint16_t)sendreq->req_send.req_base.req_sequence)) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RGET: completion event: dropping because no valid request " \
"PML:exp=%d,act=%d CTX:exp=%d,act=%d SRC:exp=%d,act=%d " \
"RQS:exp=%d,act=%d, dst_req=%p", \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
hdr->hdr_match.hdr_seq, \
sendreq->req_send.req_base.req_comm->c_contextid, \
hdr->hdr_match.hdr_ctx, \
sendreq->req_send.req_base.req_comm->c_my_rank, \
hdr->hdr_match.hdr_src, \
sendreq->req_restartseq, hdr->hdr_rndv.hdr_restartseq, \
(void *)sendreq); \
return; \
} \
mca_pml_bfo_send_request_restart(sendreq, true, MCA_PML_BFO_HDR_TYPE_RGET); \
return; \
default: \
opal_output(0, "%s:%d FATAL ERROR, unknown header (hdr=%d)", \
__FILE__, __LINE__, hdr->hdr_common.hdr_type); \
orte_errmgr.abort(-1, NULL); \
} \
} while (0)
/* Check if there has been an error on the send request when we get
* a completion event on the RDMA write. */
#define MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_PUT_COMPLETION(sendreq, status, btl) \
if ( OPAL_UNLIKELY(sendreq->req_error)) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"RDMA write: completion: sendreq has error, outstanding events=%d," \
" PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d", \
sendreq->req_events, \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
sendreq->req_restartseq, (void *)sendreq, \
sendreq->req_recv.pval, \
status, sendreq->req_send.req_base.req_peer); \
if (0 == sendreq->req_events) { \
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
MCA_PML_BFO_HDR_TYPE_PUT, \
status, btl); \
} \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
return; \
#define MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl) \
if ( OPAL_UNLIKELY(sendreq->req_error)) { \
mca_pml_bfo_completion_sendreq_has_error(sendreq, status, btl, \
MCA_PML_BFO_HDR_TYPE_PUT, "RDMA write"); \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
return; \
}
#define MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, type) \
@ -416,6 +351,28 @@ do {
mca_pml_bfo_update_eager_bml_btl_recv_ctl(&bml_btl, btl, des); \
}
#define MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl) \
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) { \
opal_output_verbose(30, mca_pml_bfo_output, \
"PUT received: no matching BTL to RDMA write to, oustanding " \
"events=%d, PML=%d, RQS=%d, src_req=%p, dst_req=%p, peer=%d", \
sendreq->req_events, \
(uint16_t)sendreq->req_send.req_base.req_sequence, \
sendreq->req_restartseq, (void *)sendreq, \
sendreq->req_recv.pval, sendreq->req_send.req_base.req_peer); \
MCA_PML_BFO_RDMA_FRAG_RETURN(frag); \
sendreq->req_error++; \
if (0 == sendreq->req_events) { \
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false, \
MCA_PML_BFO_HDR_TYPE_PUT, \
OMPI_ERROR, btl); \
} \
return; \
}
END_C_DECLS
#endif

Просмотреть файл

@ -36,7 +36,9 @@ typedef enum {
struct mca_pml_bfo_rdma_frag_t {
ompi_free_list_item_t super;
mca_bml_base_btl_t* rdma_bml;
#ifdef PML_BFO
mca_btl_base_module_t* rdma_btl;
#endif
mca_pml_bfo_hdr_t rdma_hdr;
mca_pml_bfo_rdma_state_t rdma_state;
size_t rdma_length;

Просмотреть файл

@ -41,9 +41,9 @@
#include "pml_bfo_recvreq.h"
#include "pml_bfo_sendreq.h"
#include "pml_bfo_hdr.h"
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
#include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */
#endif
OBJ_CLASS_INSTANCE( mca_pml_bfo_buffer_t,
ompi_free_list_item_t,
@ -242,13 +242,13 @@ void mca_pml_bfo_recv_frag_callback_match(mca_btl_base_module_t* btl,
slow_path:
OPAL_THREAD_UNLOCK(&comm->matching_lock);
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
/* Check for duplicate messages. If message is duplicate, then just
* return as that essentially drops the message. */
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
return;
}
/* BFO FAILOVER CODE - end */
#endif
mca_pml_bfo_recv_frag_match(btl, hdr, segments,
num_segments, MCA_PML_BFO_HDR_TYPE_MATCH);
}
@ -306,9 +306,9 @@ void mca_pml_bfo_recv_frag_callback_ack(mca_btl_base_module_t* btl,
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_ACK);
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_ACK_CALLBACK(sendreq)
/* BFO FAILOVER CODE - end */
#endif
/* if the request should be delivered entirely by copy in/out
* then throttle sends */
@ -352,9 +352,9 @@ void mca_pml_bfo_recv_frag_callback_frag(mca_btl_base_module_t* btl,
}
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FRAG);
recvreq = (mca_pml_bfo_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_FRAG_CALLBACK(recvreq)
/* BFO FAILOVER CODE - end */
#endif
mca_pml_bfo_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt);
return;
@ -375,9 +375,9 @@ void mca_pml_bfo_recv_frag_callback_put(mca_btl_base_module_t* btl,
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_PUT);
sendreq = (mca_pml_bfo_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_PUT_CALLBACK(sendreq)
/* BFO FAILOVER CODE - end */
#endif
mca_pml_bfo_send_request_put(sendreq,btl,&hdr->hdr_rdma);
return;
@ -398,11 +398,11 @@ void mca_pml_bfo_recv_frag_callback_fin(mca_btl_base_module_t* btl,
bfo_hdr_ntoh(hdr, MCA_PML_BFO_HDR_TYPE_FIN);
rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if (true == mca_pml_bfo_is_duplicate_fin(hdr, rdma, btl)) {
return;
}
/* BFO FAILOVER CODE - end */
#endif
rdma->des_cbfunc(btl, NULL, rdma,
hdr->hdr_fin.hdr_fail ? OMPI_ERROR : OMPI_SUCCESS);
@ -626,7 +626,7 @@ static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
* the fragment.
*/
OPAL_THREAD_LOCK(&comm->matching_lock);
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
/* In case of network failover, we may get a message telling us to
* restart. In that case, we already have a pointer to the receive
* request in the header itself. */
@ -635,8 +635,8 @@ static int mca_pml_bfo_recv_frag_match( mca_btl_base_module_t *btl,
if (NULL == match) {
return OMPI_SUCCESS;
}
/* BFO FAILOVER CODE - end */
} else {
#endif
/* get sequence number of next message that can be processed */
next_msg_seq_expected = (uint16_t)proc->expected_sequence;
@ -673,8 +673,9 @@ out_of_order_match:
/* release matching lock before processing fragment */
OPAL_THREAD_UNLOCK(&comm->matching_lock);
#ifdef PML_BFO
}
#endif
if(OPAL_LIKELY(match)) {
switch(type) {
case MCA_PML_BFO_HDR_TYPE_MATCH:
@ -716,14 +717,13 @@ wrong_seq:
* This message comes after the next expected, so it
* is ahead of sequence. Save it for later.
*/
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
/* Check for duplicate messages. If message is duplicate, then just
* return as that essentially drops the message. */
if (true == mca_pml_bfo_is_duplicate_msg(proc, hdr)) {
return OMPI_SUCCESS;
}
/* BFO FAILOVER CODE - end */
#endif
append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments,
num_segments, NULL);
OPAL_THREAD_UNLOCK(&comm->matching_lock);

Просмотреть файл

@ -33,9 +33,9 @@
#include "orte/mca/errmgr/errmgr.h"
#include "opal/util/arch.h"
#include "ompi/memchecker.h"
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
#include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */
#endif
void mca_pml_bfo_recv_request_process_pending(void)
{
@ -170,13 +170,13 @@ static void mca_pml_bfo_recv_ctl_completion( mca_btl_base_module_t* btl,
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) {
mca_pml_bfo_check_recv_ctl_completion_status(btl, des, status);
}
MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL_RECV_CTL(bml_btl, btl, des);
/* BFO FAILOVER CODE - end */
#endif
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
@ -199,12 +199,13 @@ static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl,
}
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,-1);
#ifdef PML_BFO
btl->btl_free(btl, des);
/* BFO FAILOVER CODE - begin */
MCA_PML_BFO_ERROR_CHECK_ON_FIN_FOR_PUT(recvreq);
MCA_PML_BFO_CHECK_RECVREQ_EAGER_BML_BTL(bml_btl, btl, recvreq, "PUT");
/* BFO FAILOVER CODE - end */
#else
mca_bml_base_free(bml_btl, des);
#endif
/* check completion status */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, bytes_received);
@ -249,18 +250,18 @@ int mca_pml_bfo_recv_request_ack_send_btl(
/* initialize descriptor */
des->des_cbfunc = mca_pml_bfo_recv_ctl_completion;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
des->des_cbdata = (void *)proc;
/* BFO FAILOVER CODE - end */
#endif
rc = mca_bml_base_send(bml_btl, des, MCA_PML_BFO_HDR_TYPE_ACK);
if( OPAL_LIKELY( rc >= 0 ) ) {
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
(des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
((mca_pml_bfo_recv_request_t *)hdr_dst_req)->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des);
@ -346,29 +347,39 @@ static void mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata;
mca_pml_bfo_recv_request_t* recvreq = (mca_pml_bfo_recv_request_t*)frag->rdma_req;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if (btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) {
recvreq->req_events--;
assert(recvreq->req_events >= 0);
}
/* BFO FAILOVER CODE - end */
#endif
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
#ifdef PML_BFO
MCA_PML_BFO_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq);
#else
/* TSW - FIX */
ORTE_ERROR_LOG(status);
orte_errmgr.abort(-1, NULL);
#endif
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
MCA_PML_BFO_SECOND_ERROR_CHECK_ON_RDMA_READ_COMPLETION(recvreq, status, btl);
MCA_PML_BFO_CHECK_RECVREQ_RDMA_BML_BTL(bml_btl, btl, recvreq, "RDMA write");
/* BFO FAILOVER CODE - end */
#endif
mca_pml_bfo_send_fin(recvreq->req_recv.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rget.hdr_des,
des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq,
recvreq->req_recv.req_base.req_comm->c_contextid,
recvreq->req_recv.req_base.req_comm->c_my_rank);
#ifdef PML_BFO
des->order, 0, (uint16_t)recvreq->req_msgseq, recvreq->req_restartseq,
recvreq->req_recv.req_base.req_comm->c_contextid,
recvreq->req_recv.req_base.req_comm->c_my_rank);
#else
des->order, 0);
#endif
/* is receive request complete */
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length);
@ -432,12 +443,12 @@ int mca_pml_bfo_recv_request_get_frag( mca_pml_bfo_rdma_frag_t* frag )
orte_errmgr.abort(-1, NULL);
}
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if ((bml_btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
(descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
recvreq->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
return OMPI_SUCCESS;
}
@ -519,9 +530,9 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq
0, bytes_received );
recvreq->req_recv.req_bytes_packed = hdr->hdr_rndv.hdr_msg_length;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req;
/* BFO FAILOVER CODE - end */
#endif
MCA_PML_BFO_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_rndv.hdr_match);
/* if receive buffer is not contiguous we can't just RDMA read into it, so
@ -556,9 +567,9 @@ void mca_pml_bfo_recv_request_progress_rget( mca_pml_bfo_recv_request_t* recvreq
size += hdr->hdr_segs[i].seg_len;
}
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
frag->rdma_btl = btl;
/* BFO FAILOVER CODE - end */
#endif
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
opal_output(0, "[%s:%d] invalid bml for rdma get", __FILE__, __LINE__);
@ -828,9 +839,9 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
continue;
}
ctl->des_cbfunc = mca_pml_bfo_recv_ctl_completion;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
ctl->des_cbdata = recvreq;
/* BFO FAILOVER CODE - end */
#endif
/* fill in rdma header */
hdr = (mca_pml_bfo_rdma_hdr_t*)ctl->des_src->seg_addr.pval;
@ -838,9 +849,9 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
hdr->hdr_common.hdr_flags =
(!recvreq->req_ack_sent) ? MCA_PML_BFO_HDR_TYPE_ACK : 0;
hdr->hdr_req = recvreq->remote_req_send;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
hdr->hdr_dst_req.pval = recvreq; /* only needed in the first put message */
/* BFO FAILOVER CODE - end */
#endif
hdr->hdr_des.pval = dst;
hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
hdr->hdr_seg_cnt = dst->des_dst_cnt;
@ -862,12 +873,12 @@ int mca_pml_bfo_recv_request_schedule_once( mca_pml_bfo_recv_request_t* recvreq,
/* send rdma request to peer */
rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_BFO_HDR_TYPE_PUT);
if( OPAL_LIKELY( rc >= 0 ) ) {
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if ((btl->btl_flags & MCA_BTL_FLAGS_FAILOVER_SUPPORT) &&
(ctl->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
recvreq->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
/* update request state */
recvreq->req_rdma_offset += size;
OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
@ -993,12 +1004,12 @@ void mca_pml_bfo_recv_req_start(mca_pml_bfo_recv_request_t *req)
req->req_bytes_received = 0;
req->req_bytes_expected = 0;
/* What about req_rdma_cnt ? */
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
req->req_rdma_cnt = 0;
req->req_events = 0;
req->req_restartseq = 0;
req->req_errstate = 0;
/* BFO FAILOVER CODE - end */
#endif
req->req_rdma_idx = 0;
req->req_pending = false;
req->req_ack_sent = false;

Просмотреть файл

@ -30,24 +30,23 @@
#include "ompi/mca/pml/bfo/pml_bfo_comm.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/pml/base/pml_base_recvreq.h"
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
#define RECVREQ_RECVERRSENT 0x01
#define RECVREQ_RNDVRESTART_RECVED 0x02
#define RECVREQ_RNDVRESTART_ACKED 0x04
/* BFO FAILOVER CODE - end */
#endif
BEGIN_C_DECLS
struct mca_pml_bfo_recv_request_t {
mca_pml_base_recv_request_t req_recv;
ompi_ptr_t remote_req_send;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
int32_t req_msgseq; /* PML sequence number */
int32_t req_events; /* number of outstanding events on request */
int32_t req_restartseq; /* sequence number of restarted request */
int32_t req_errstate; /* state of request if in error */
/* BFO FAILOVER CODE - end */
#endif
int32_t req_lock;
size_t req_pipeline_depth;
size_t req_bytes_received; /**< amount of data transferred into the user buffer */
@ -169,10 +168,10 @@ recv_request_pml_complete(mca_pml_bfo_recv_request_t *recvreq)
}
}
recvreq->req_rdma_cnt = 0;
/* BFO FAILOVER CODE - begin */
/* Initialize to a value that we indicate it is invalid */
recvreq->req_msgseq = 42;
/* BFO FAILOVER CODE - end */
#ifdef PML_BFO
/* Reset to a value that to indicate it is invalid. */
recvreq->req_msgseq = recvreq->req_msgseq - 100;
#endif
OPAL_THREAD_LOCK(&ompi_request_lock);
if(true == recvreq->req_recv.req_base.req_free_called) {
@ -201,7 +200,11 @@ recv_request_pml_complete_check(mca_pml_bfo_recv_request_t *recvreq)
#endif
if(recvreq->req_match_received &&
recvreq->req_bytes_received >= recvreq->req_recv.req_bytes_packed &&
#ifdef PML_BFO
(0 == recvreq->req_events) && lock_recv_request(recvreq)) {
#else
lock_recv_request(recvreq)) {
#endif
recv_request_pml_complete(recvreq);
return true;
}
@ -236,9 +239,9 @@ static inline void recv_req_matched(mca_pml_bfo_recv_request_t *req,
req->req_recv.req_base.req_ompi.req_status.MPI_SOURCE = hdr->hdr_src;
req->req_recv.req_base.req_ompi.req_status.MPI_TAG = hdr->hdr_tag;
req->req_match_received = true;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
req->req_msgseq = hdr->hdr_seq;
/* BFO FAILOVER CODE - end */
#endif
#if OPAL_HAVE_THREAD_SUPPORT
opal_atomic_wmb();
#endif

Просмотреть файл

@ -31,9 +31,9 @@
#include "pml_bfo_sendreq.h"
#include "pml_bfo_rdmafrag.h"
#include "pml_bfo_recvreq.h"
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
#include "pml_bfo_failover.h"
/* BFO FAILOVER CODE - end */
#endif
#include "ompi/mca/bml/base/base.h"
#include "ompi/memchecker.h"
@ -183,12 +183,18 @@ mca_pml_bfo_match_completion_free( struct mca_btl_base_module_t* btl,
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
mca_pml_bfo_repost_match_fragment(des);
return;
/* BFO FAILOVER CODE - end */
#else
/* TSW - FIX */
opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
#endif
}
#ifdef PML_BFO
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "MATCH");
#endif
mca_pml_bfo_match_completion_free_request( bml_btl, sendreq );
}
@ -229,13 +235,20 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl,
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
MCA_PML_BFO_ERROR_ON_RNDV_COMPLETION(sendreq, des);
#ifdef PML_BFO
if (true == mca_pml_bfo_rndv_completion_status_error(des, sendreq))
return;
#else
/* TSW - FIX */
opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
#endif
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
sendreq->req_events--;
MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RNDV_COMPLETION(sendreq, status, btl);
/* BFO FAILOVER CODE - end */
MCA_PML_BFO_RNDV_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl,
MCA_PML_BFO_HDR_TYPE_RNDV, "RNDV");
#endif
/* count bytes of user data actually delivered. As the rndv completion only
* happens in one thread, the increase of the req_bytes_delivered does not
@ -246,7 +259,9 @@ mca_pml_bfo_rndv_completion( mca_btl_base_module_t* btl,
sizeof(mca_pml_bfo_rendezvous_hdr_t),
req_bytes_delivered );
#ifdef PML_BFO
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RNDV");
#endif
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered );
}
@ -264,8 +279,9 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context;
size_t req_bytes_delivered = 0;
MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_RGET_COMPLETION(sendreq, btl, des);
#ifdef PML_BFO
MCA_PML_BFO_RGET_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, btl, des);
#endif
/* count bytes of user data actually delivered and check for request completion */
MCA_PML_BFO_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
@ -274,8 +290,12 @@ mca_pml_bfo_rget_completion( mca_btl_base_module_t* btl,
send_request_pml_complete_check(sendreq);
/* free the descriptor */
#ifdef PML_BFO
btl->btl_free(btl, des);
MCA_PML_BFO_CHECK_SENDREQ_RDMA_BML_BTL(bml_btl, btl, sendreq, "RGET");
#else
mca_bml_base_free(bml_btl, des);
#endif
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
@ -292,15 +312,15 @@ mca_pml_bfo_send_ctl_completion( mca_btl_base_module_t* btl,
{
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* BFO FAILOVER CODE - begin */
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
if(OPAL_LIKELY(OMPI_SUCCESS == status)) {
/* check for pending requests */
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "ACK or RGET");
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
} else {
MCA_PML_BFO_ERROR_ON_SEND_CTL_COMPLETION(sendreq, des);
#ifdef PML_BFO
if(OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
mca_pml_bfo_send_ctl_completion_status_error(des);
return;
}
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, des->des_cbdata, "RGET");
#endif
/* check for pending requests */
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
/**
@ -317,15 +337,19 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)des->des_cbdata;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
size_t req_bytes_delivered = 0;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
sendreq->req_events--;
/* BFO FAILOVER CODE - end */
#endif
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
sendreq->req_error++;
/* BFO FAILOVER CODE - end */
#else
/* TSW - FIX */
opal_output(0, "%s:%d FATAL", __FILE__, __LINE__);
orte_errmgr.abort(-1, NULL);
#endif
}
/* count bytes of user data actually delivered */
@ -337,52 +361,23 @@ mca_pml_bfo_frag_completion( mca_btl_base_module_t* btl,
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, -1);
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
/* BFO FAILOVER CODE - begin */
/* note we check error after bytes delivered computation in case frag made it */
if( OPAL_UNLIKELY(sendreq->req_error)) {
opal_output_verbose(30, mca_pml_bfo_output,
"FRAG: completion: sendreq has error, outstanding events=%d, "
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, (void *)sendreq,
sendreq->req_recv.pval,
status, sendreq->req_send.req_base.req_peer);
if (0 == sendreq->req_events) {
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
MCA_PML_BFO_HDR_TYPE_FRAG,
status, btl);
}
return;
}
/* BFO FAILOVER CODE - end */
#ifdef PML_BFO
MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl,
MCA_PML_BFO_HDR_TYPE_FRAG, "FRAG");
#endif
if(send_request_pml_complete_check(sendreq) == false) {
mca_pml_bfo_send_request_schedule(sendreq);
/* BFO FAILOVER CODE - begin */
if( OPAL_UNLIKELY(sendreq->req_error)) {
/* This situation can happen if the scheduling function
* determined that a BTL was removed from underneath us
* and therefore marked the request in error. In that
* case, the scheduling of fragments can no longer proceed
* properly. Therefore, if no outstanding events, initiate
* the restart dance. */
opal_output_verbose(30, mca_pml_bfo_output,
"FRAG: completion: BTL has been removed, outstanding events=%d, "
"PML=%d, RQS=%d, src_req=%p, dst_req=%p, status=%d, peer=%d",
sendreq->req_events, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, (void *)sendreq,
sendreq->req_recv.pval,
status, sendreq->req_send.req_base.req_peer);
if (0 == sendreq->req_events) {
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
MCA_PML_BFO_HDR_TYPE_FRAG,
status, btl);
}
}
/* BFO FAILOVER CODE - end */
#ifdef PML_BFO
MCA_PML_BFO_FRAG_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl,
MCA_PML_BFO_HDR_TYPE_FRAG,
"FRAG (BTL removal)");
#endif
}
/* check for pending requests */
#ifdef PML_BFO
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "FRAG");
#endif
MCA_PML_BFO_PROGRESS_PENDING(bml_btl);
}
@ -438,7 +433,9 @@ int mca_pml_bfo_send_request_start_buffered(
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
#ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV(buffered)");
#endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc);
@ -487,11 +484,11 @@ int mca_pml_bfo_send_request_start_buffered(
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered);
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
sendreq->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des );
@ -537,14 +534,13 @@ int mca_pml_bfo_send_request_start_copy( mca_pml_bfo_send_request_t* sendreq,
MCA_PML_BFO_HDR_TYPE_MATCH,
&des);
if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
/* BFO FAILOVER CODE - begin */
/* Needed for failover */
#ifdef PML_BFO
/* Needed in case of failover */
if (NULL != des) {
des->des_cbfunc = mca_pml_bfo_match_completion_free;
des->des_cbdata = sendreq->req_endpoint;
}
/* BFO FAILOVER CODE - end */
#endif
/* signal request completion */
send_request_pml_complete(sendreq);
@ -774,9 +770,9 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RGET");
/* BFO FAILOVER CODE - end */
#endif
hdr->hdr_rget.hdr_des.pval = src;
hdr->hdr_rget.hdr_seg_cnt = src->des_src_cnt;
@ -826,9 +822,9 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV");
/* BFO FAILOVER CODE - end */
#endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc);
@ -852,12 +848,12 @@ int mca_pml_bfo_send_request_start_rdma( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_LIKELY( 1 == rc ) && (true == need_local_cb)) {
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, 0 );
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if ((des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) &&
(MCA_PML_BFO_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type)) {
sendreq->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des);
@ -925,9 +921,9 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq,
hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence;
hdr->hdr_rndv.hdr_msg_length = sendreq->req_send.req_bytes_packed;
hdr->hdr_rndv.hdr_src_req.pval = sendreq;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
MCA_PML_BFO_CHECK_FOR_RNDV_RESTART(hdr, sendreq, "RNDV");
/* BFO FAILOVER CODE - end */
#endif
bfo_hdr_hton(hdr, MCA_PML_BFO_HDR_TYPE_RNDV,
sendreq->req_send.req_base.req_proc);
@ -945,11 +941,11 @@ int mca_pml_bfo_send_request_start_rndv( mca_pml_bfo_send_request_t* sendreq,
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_bfo_rndv_completion_request( bml_btl, sendreq, size );
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
sendreq->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
return OMPI_SUCCESS;
}
mca_bml_base_free(bml_btl, des );
@ -1062,7 +1058,7 @@ mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq)
mca_bml_base_btl_t* bml_btl;
assert(range->range_send_length != 0);
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
/* Failover code. If this is true, this means the request thinks we
* have more BTLs than there really are. This can happen because
* a BTL was removed from the available list. In this case, we
@ -1072,7 +1068,7 @@ mca_pml_bfo_send_request_schedule_once(mca_pml_bfo_send_request_t* sendreq)
sendreq->req_error++;
return OMPI_ERROR;
}
/* BFO FAILOVER CODE - end */
#endif
if(prev_bytes_remaining == range->range_send_length)
num_fail++;
@ -1181,11 +1177,11 @@ cannot_pack:
range = get_next_send_range(sendreq, range);
prev_bytes_remaining = 0;
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
sendreq->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
} else {
mca_bml_base_free(bml_btl,des);
}
@ -1209,28 +1205,33 @@ static void mca_pml_bfo_put_completion( mca_btl_base_module_t* btl,
mca_pml_bfo_rdma_frag_t* frag = (mca_pml_bfo_rdma_frag_t*)des->des_cbdata;
mca_pml_bfo_send_request_t* sendreq = (mca_pml_bfo_send_request_t*)frag->rdma_req;
mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context;
/* BFO FAILOVER CODE - begin */
sendreq->req_events--;
/* BFO FAILOVER CODE - end */
/* check completion status */
if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) {
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
sendreq->req_error++;
/* BFO FAILOVER CODE - end */
#else
/* TSW - FIX */
ORTE_ERROR_LOG(status);
orte_errmgr.abort(-1, NULL);
#endif
}
/* BFO FAILOVER CODE - begin */
MCA_PML_BFO_CHECK_SENDREQ_ERROR_ON_PUT_COMPLETION(sendreq, status, btl);
#ifdef PML_BFO
sendreq->req_events--;
MCA_PML_BFO_PUT_COMPLETION_SENDREQ_ERROR_CHECK(sendreq, status, btl);
MCA_PML_BFO_CHECK_SENDREQ_EAGER_BML_BTL(bml_btl, btl, sendreq, "RDMA write");
/* BFO FAILOVER CODE - end */
#endif
mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_des,
des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank);
#ifdef PML_BFO
des->order, 0, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank);
#else
des->order, 0);
#endif
/* check for request completion */
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length);
@ -1275,9 +1276,13 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag )
/* tell receiver to unregister memory */
mca_pml_bfo_send_fin(sendreq->req_send.req_base.req_proc,
bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank);
#ifdef PML_BFO
MCA_BTL_NO_ORDER, 1, (uint16_t)sendreq->req_send.req_base.req_sequence,
sendreq->req_restartseq, sendreq->req_send.req_base.req_comm->c_contextid,
sendreq->req_send.req_base.req_comm->c_my_rank);
#else
MCA_BTL_NO_ORDER, 1);
#endif
/* send fragment by copy in/out */
mca_pml_bfo_send_request_copy_in_out(sendreq,
@ -1313,14 +1318,11 @@ int mca_pml_bfo_send_request_put_frag( mca_pml_bfo_rdma_frag_t* frag )
orte_errmgr.abort(-1, NULL);
}
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
mca_pml_bfo_send_request_t *sendreq =
(mca_pml_bfo_send_request_t*)frag->rdma_req;
sendreq->req_events++;
((mca_pml_bfo_send_request_t*)frag->rdma_req)->req_events++;
}
/* BFO FAILOVER CODE - end */
#endif
return OMPI_SUCCESS;
}
@ -1341,18 +1343,20 @@ void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq,
size_t i, size = 0;
if(hdr->hdr_common.hdr_flags & MCA_PML_BFO_HDR_TYPE_ACK) {
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
/* Handle the failover case where a RNDV request may
* have turned into a RGET and therefore the state
* is not being tracked. */
if (sendreq->req_state != 0) {
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
}
/* BFO FAILOVER CODE - end */
#else
OPAL_THREAD_ADD32(&sendreq->req_state, -1);
#endif
}
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
sendreq->req_recv = hdr->hdr_dst_req; /* only needed once, but it is OK */
/* BFO FAILOVER CODE - end */
#endif
MCA_PML_BFO_RDMA_FRAG_ALLOC(frag, rc);
@ -1380,21 +1384,10 @@ void mca_pml_bfo_send_request_put( mca_pml_bfo_send_request_t* sendreq,
}
frag->rdma_bml = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
frag->rdma_btl = btl;
if( OPAL_UNLIKELY(NULL == frag->rdma_bml) ) {
opal_output(0, "[%s:%d] invalid bml for rdma put", __FILE__, __LINE__);
MCA_PML_BFO_RDMA_FRAG_RETURN(frag);
sendreq->req_error++;
if (0 == sendreq->req_events) {
opal_output(0, "[%s:%d] Issuing rndvrestartnotify", __FILE__, __LINE__);
mca_pml_bfo_send_request_rndvrestartnotify(sendreq, false,
MCA_PML_BFO_HDR_TYPE_PUT,
OMPI_ERROR, btl);
}
return;
}
/* BFO FAILOVER CODE - end */
MCA_PML_BFO_CHECK_FOR_REMOVED_BML(sendreq, frag, btl);
#endif
frag->rdma_hdr.hdr_rdma = *hdr;
frag->rdma_req = sendreq;
frag->rdma_ep = bml_endpoint;

Просмотреть файл

@ -42,12 +42,12 @@ struct mca_pml_bfo_send_request_t {
mca_pml_base_send_request_t req_send;
mca_bml_base_endpoint_t* req_endpoint;
ompi_ptr_t req_recv;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
int32_t req_events; /* number of outstanding events on request */
int32_t req_restartseq; /* sequence number of restarted request */
int32_t req_restart; /* state of restarted request */
int32_t req_error; /* non-zero when error has occurred on request */
/* BFO FAILOVER CODE - end */
#endif
int32_t req_state;
int32_t req_lock;
bool req_throttle_sends;
@ -249,7 +249,7 @@ send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq)
MCA_PML_BFO_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
}
sendreq->req_send.req_base.req_pml_complete = true;
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
assert(0 == sendreq->req_events);
sendreq->req_restartseq = 0;
/* Since sequence numbers increase monotonically and
@ -258,7 +258,7 @@ send_request_pml_complete(mca_pml_bfo_send_request_t *sendreq)
* as that is not within the valid range. */
sendreq->req_send.req_base.req_sequence =
sendreq->req_send.req_base.req_sequence - 10;
/* BFO FAILOVER CODE - end */
#endif
if(sendreq->req_send.req_base.req_free_called) {
MCA_PML_BFO_SEND_REQUEST_RETURN(sendreq);
@ -437,12 +437,12 @@ mca_pml_bfo_send_request_start( mca_pml_bfo_send_request_t* sendreq )
sendreq->req_pending = MCA_PML_BFO_SEND_PENDING_NONE;
sendreq->req_send.req_base.req_sequence = OPAL_THREAD_ADD32(
&comm->procs[sendreq->req_send.req_base.req_peer].send_sequence,1);
/* BFO FAILOVER CODE - begin */
#ifdef PML_BFO
sendreq->req_restartseq = 0; /* counts up restarts */
sendreq->req_restart = 0; /* reset in case we restart again */
sendreq->req_error = 0; /* clear error state */
sendreq->req_events = 0; /* clear events, probably 0 anyways */
/* BFO FAILOVER CODE - end */
#endif
MCA_PML_BASE_SEND_START( &sendreq->req_send.req_base );