Some BTLs (e.g. TCP) can report put/get completion before the data actually
hits the buffer on the other side. For such BTLs we need to send the FIN through the same BTL that the PUT was performed with, so that the network handles ordering for us. If we use another BTL, the receiver can get the FIN before the data hits the buffer and complete the request prematurely. We mark such problematic BTLs with the MCA_BTL_FLAGS_FAKE_RDMA flag (this kind of RDMA is really fake, because real RDMA guarantees that the sender will see the completion only after the receiver's NIC has confirmed that all the data was received). This commit was SVN r12732.
Этот коммит содержится в:
родитель
39c930b160
Коммит
30ca7457b4
@ -152,6 +152,9 @@ typedef uint8_t mca_btl_base_tag_t;
|
||||
#define MCA_BTL_FLAGS_NEED_ACK 0x10
|
||||
#define MCA_BTL_FLAGS_NEED_CSUM 0x20
|
||||
|
||||
/* btl can report put/get completion before data hits the other side */
|
||||
#define MCA_BTL_FLAGS_FAKE_RDMA 0x40
|
||||
|
||||
/* Default exclusivity levels */
|
||||
#define MCA_BTL_EXCLUSIVITY_HIGH 64*1024 /* internal loopback */
|
||||
#define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */
|
||||
|
@ -224,7 +224,8 @@ int mca_btl_tcp_component_open(void)
|
||||
mca_btl_tcp_param_register_int("flags", MCA_BTL_FLAGS_PUT |
|
||||
MCA_BTL_FLAGS_SEND_INPLACE |
|
||||
MCA_BTL_FLAGS_NEED_CSUM |
|
||||
MCA_BTL_FLAGS_NEED_ACK );
|
||||
MCA_BTL_FLAGS_NEED_ACK |
|
||||
MCA_BTL_FLAGS_FAKE_RDMA);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -281,15 +281,20 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_ob1.pckt_pending);
|
||||
|
||||
for(i = 0; i < s; i++) {
|
||||
mca_bml_base_btl_t *send_dst;
|
||||
mca_bml_base_btl_t *send_dst = NULL;
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
pckt = (mca_pml_ob1_pckt_pending_t*)
|
||||
opal_list_remove_first(&mca_pml_ob1.pckt_pending);
|
||||
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
|
||||
if(NULL == pckt)
|
||||
break;
|
||||
send_dst = mca_bml_base_btl_array_find(
|
||||
&pckt->proc->proc_bml->btl_eager, bml_btl->btl);
|
||||
if(pckt->bml_btl != NULL &&
|
||||
pckt->bml_btl->btl == bml_btl->btl) {
|
||||
send_dst = pckt->bml_btl;
|
||||
} else {
|
||||
send_dst = mca_bml_base_btl_array_find(
|
||||
&pckt->proc->proc_bml->btl_eager, bml_btl->btl);
|
||||
}
|
||||
if(NULL == send_dst) {
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
|
||||
opal_list_append(&mca_pml_ob1.pckt_pending,
|
||||
@ -320,7 +325,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
|
||||
MCA_PML_OB1_PCKT_PENDING_RETURN(pckt);
|
||||
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(pckt->proc,
|
||||
pckt->hdr.hdr_fin.hdr_des.pval);
|
||||
pckt->hdr.hdr_fin.hdr_des.pval, pckt->bml_btl);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
@ -237,6 +237,7 @@ struct mca_pml_ob1_pckt_pending_t {
|
||||
ompi_free_list_item_t super;
|
||||
ompi_proc_t* proc;
|
||||
mca_pml_ob1_hdr_t hdr;
|
||||
struct mca_bml_base_btl_t *bml_btl;
|
||||
};
|
||||
typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
|
||||
OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
|
||||
@ -255,7 +256,7 @@ do { \
|
||||
(ompi_free_list_item_t*)pckt); \
|
||||
} while(0)
|
||||
|
||||
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D) \
|
||||
#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B) \
|
||||
do { \
|
||||
mca_pml_ob1_pckt_pending_t *_pckt; \
|
||||
int _rc; \
|
||||
@ -264,6 +265,7 @@ do { \
|
||||
_pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
|
||||
_pckt->hdr.hdr_fin.hdr_des.pval = (D); \
|
||||
_pckt->proc = (P); \
|
||||
_pckt->bml_btl = (B); \
|
||||
OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
|
||||
opal_list_append(&mca_pml_ob1.pckt_pending, \
|
||||
(opal_list_item_t*)_pckt); \
|
||||
@ -273,20 +275,37 @@ do { \
|
||||
|
||||
int mca_pml_ob1_send_fin_btl(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
|
||||
void *hdr_des);
|
||||
static inline int mca_pml_ob1_send_fin(ompi_proc_t* proc, void *hdr_des)
|
||||
static inline int mca_pml_ob1_send_fin(ompi_proc_t* proc, void *hdr_des,
|
||||
mca_bml_base_btl_t* bml_btl)
|
||||
{
|
||||
size_t i;
|
||||
mca_bml_base_btl_t* bml_btl;
|
||||
mca_bml_base_endpoint_t* endpoint =
|
||||
(mca_bml_base_endpoint_t*)proc->proc_bml;
|
||||
|
||||
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
/* Some BTLs (e.g. TCP) can report put/get completion before data actually
|
||||
* hits the buffer on the other side. For this kind of BTLs we need to send
|
||||
* FIN through the same BTL the PUT was performed with, so the network will handle
|
||||
* ordering for us. If we use another BTL, the receiver can get FIN before
|
||||
* data will hit the buffer and complete request prematurely. We mark such
|
||||
* problematic BTLs with MCA_BTL_FLAGS_FAKE_RDMA flag (this kind of RDMA
|
||||
* is really fake, because the real one guarantees that sender will see the
|
||||
* completion only after receiver's NIC confirmed that all the data was
|
||||
* received)
|
||||
*/
|
||||
if(bml_btl->btl_flags & MCA_BTL_FLAGS_FAKE_RDMA) {
|
||||
if(mca_pml_ob1_send_fin_btl(proc, bml_btl, hdr_des) == OMPI_SUCCESS)
|
||||
return OMPI_SUCCESS;
|
||||
} else {
|
||||
for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager);
|
||||
i++) {
|
||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
||||
if(mca_pml_ob1_send_fin_btl(proc, bml_btl, hdr_des) == OMPI_SUCCESS)
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
bml_btl = NULL;
|
||||
}
|
||||
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des);
|
||||
MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl);
|
||||
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
@ -330,7 +330,7 @@ static void mca_pml_ob1_rget_completion(
|
||||
}
|
||||
|
||||
mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
|
||||
frag->rdma_hdr.hdr_rget.hdr_des.pval);
|
||||
frag->rdma_hdr.hdr_rget.hdr_des.pval, bml_btl);
|
||||
|
||||
/* is receive request complete */
|
||||
if( OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length)
|
||||
|
@ -1018,7 +1018,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
|
||||
}
|
||||
|
||||
mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
|
||||
frag->rdma_hdr.hdr_rdma.hdr_des.pval);
|
||||
frag->rdma_hdr.hdr_rdma.hdr_des.pval, bml_btl);
|
||||
|
||||
/* check for request completion */
|
||||
if( OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length)
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user