
Some BTLs (e.g., TCP) can report put/get completion before the data actually
hits the buffer on the other side. For these BTLs we need to send the FIN
through the same BTL the PUT was performed with, so the network handles
ordering for us. If we used another BTL, the receiver could get the FIN before
the data hits the buffer and complete the request prematurely. We mark such
problematic BTLs with the MCA_BTL_FLAGS_FAKE_RDMA flag (this kind of RDMA
is really fake, because real RDMA guarantees that the sender sees the
completion only after the receiver's NIC has confirmed that all the data was
received).

This commit was SVN r12732.
This commit is contained in:
Gleb Natapov 2006-12-03 10:12:09 +00:00
parent 39c930b160
commit 30ca7457b4
6 changed files with 41 additions and 13 deletions
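For orientation before the diffs, here is a minimal, self-contained sketch of the FIN-routing rule this commit introduces. The struct and function names (sketch_btl, fin_btl_for) are hypothetical stand-ins; only MCA_BTL_FLAGS_FAKE_RDMA and the decision logic mirror the pml_ob1 change below.

/* Sketch: a BTL that signals completion before the data has reached the
 * peer's buffer is flagged "fake RDMA"; the FIN must then travel over the
 * same BTL as the PUT so the network preserves ordering. */
#include <stdio.h>
#include <stdint.h>

#define MCA_BTL_FLAGS_FAKE_RDMA 0x40    /* matches the flag added in btl.h */

struct sketch_btl {                     /* hypothetical stand-in for mca_bml_base_btl_t */
    const char *name;
    uint32_t    btl_flags;
};

/* Decide which BTL may carry the FIN for a PUT performed over 'put_btl'. */
static const struct sketch_btl *
fin_btl_for(const struct sketch_btl *put_btl,
            const struct sketch_btl *any_eager_btl)
{
    if (put_btl->btl_flags & MCA_BTL_FLAGS_FAKE_RDMA) {
        /* Completion may fire before the data lands: reuse the PUT's BTL so
         * the FIN cannot overtake the data. */
        return put_btl;
    }
    /* Real RDMA completion already implies delivery, so any eager BTL is
     * safe for the FIN. */
    return any_eager_btl;
}

int main(void)
{
    struct sketch_btl tcp = { "tcp",    MCA_BTL_FLAGS_FAKE_RDMA };
    struct sketch_btl ib  = { "openib", 0 };

    printf("PUT over %s -> FIN over %s\n", tcp.name, fin_btl_for(&tcp, &ib)->name);
    printf("PUT over %s -> FIN over %s\n", ib.name,  fin_btl_for(&ib,  &tcp)->name);
    return 0;
}

The same split shows up in the diffs: the flag is defined in btl.h, set by the TCP BTL, and consulted in mca_pml_ob1_send_fin().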

View file

@@ -152,6 +152,9 @@ typedef uint8_t mca_btl_base_tag_t;
 #define MCA_BTL_FLAGS_NEED_ACK 0x10
 #define MCA_BTL_FLAGS_NEED_CSUM 0x20
+/* btl can report put/get completion before data hits the other side */
+#define MCA_BTL_FLAGS_FAKE_RDMA 0x40
 /* Default exclusivity levels */
 #define MCA_BTL_EXCLUSIVITY_HIGH 64*1024 /* internal loopback */
 #define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */

View file

@@ -224,7 +224,8 @@ int mca_btl_tcp_component_open(void)
     mca_btl_tcp_param_register_int("flags", MCA_BTL_FLAGS_PUT |
                                    MCA_BTL_FLAGS_SEND_INPLACE |
                                    MCA_BTL_FLAGS_NEED_CSUM |
-                                   MCA_BTL_FLAGS_NEED_ACK );
+                                   MCA_BTL_FLAGS_NEED_ACK |
+                                   MCA_BTL_FLAGS_FAKE_RDMA);
     return OMPI_SUCCESS;
 }

View file

@@ -281,15 +281,20 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
     int32_t i, rc, s = (int32_t)opal_list_get_size(&mca_pml_ob1.pckt_pending);
     for(i = 0; i < s; i++) {
-        mca_bml_base_btl_t *send_dst;
+        mca_bml_base_btl_t *send_dst = NULL;
         OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
         pckt = (mca_pml_ob1_pckt_pending_t*)
             opal_list_remove_first(&mca_pml_ob1.pckt_pending);
         OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
         if(NULL == pckt)
             break;
-        send_dst = mca_bml_base_btl_array_find(
-                &pckt->proc->proc_bml->btl_eager, bml_btl->btl);
+        if(pckt->bml_btl != NULL &&
+                pckt->bml_btl->btl == bml_btl->btl) {
+            send_dst = pckt->bml_btl;
+        } else {
+            send_dst = mca_bml_base_btl_array_find(
+                    &pckt->proc->proc_bml->btl_eager, bml_btl->btl);
+        }
         if(NULL == send_dst) {
             OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
             opal_list_append(&mca_pml_ob1.pckt_pending,
@@ -320,7 +325,7 @@ void mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl)
         MCA_PML_OB1_PCKT_PENDING_RETURN(pckt);
         if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
             MCA_PML_OB1_ADD_FIN_TO_PENDING(pckt->proc,
-                pckt->hdr.hdr_fin.hdr_des.pval);
+                pckt->hdr.hdr_fin.hdr_des.pval, pckt->bml_btl);
             return;
         }
         break;

View file

@@ -237,6 +237,7 @@ struct mca_pml_ob1_pckt_pending_t {
     ompi_free_list_item_t super;
     ompi_proc_t* proc;
     mca_pml_ob1_hdr_t hdr;
+    struct mca_bml_base_btl_t *bml_btl;
 };
 typedef struct mca_pml_ob1_pckt_pending_t mca_pml_ob1_pckt_pending_t;
 OBJ_CLASS_DECLARATION(mca_pml_ob1_pckt_pending_t);
@@ -255,7 +256,7 @@ do { \
         (ompi_free_list_item_t*)pckt); \
 } while(0)
-#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D) \
+#define MCA_PML_OB1_ADD_FIN_TO_PENDING(P, D, B) \
 do { \
     mca_pml_ob1_pckt_pending_t *_pckt; \
     int _rc; \
@@ -264,6 +265,7 @@ do { \
     _pckt->hdr.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_FIN; \
     _pckt->hdr.hdr_fin.hdr_des.pval = (D); \
     _pckt->proc = (P); \
+    _pckt->bml_btl = (B); \
     OPAL_THREAD_LOCK(&mca_pml_ob1.lock); \
     opal_list_append(&mca_pml_ob1.pckt_pending, \
         (opal_list_item_t*)_pckt); \
@@ -273,20 +275,37 @@ do { \
 int mca_pml_ob1_send_fin_btl(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl,
     void *hdr_des);
-static inline int mca_pml_ob1_send_fin(ompi_proc_t* proc, void *hdr_des)
+static inline int mca_pml_ob1_send_fin(ompi_proc_t* proc, void *hdr_des,
+        mca_bml_base_btl_t* bml_btl)
 {
     size_t i;
-    mca_bml_base_btl_t* bml_btl;
     mca_bml_base_endpoint_t* endpoint =
         (mca_bml_base_endpoint_t*)proc->proc_bml;
-    for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) {
-        bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+    /* Some BTLs (e.g., TCP) can report put/get completion before the data
+     * actually hits the buffer on the other side. For these BTLs we need to
+     * send the FIN through the same BTL the PUT was performed with, so the
+     * network handles ordering for us. If we used another BTL, the receiver
+     * could get the FIN before the data hits the buffer and complete the
+     * request prematurely. We mark such problematic BTLs with the
+     * MCA_BTL_FLAGS_FAKE_RDMA flag (this kind of RDMA is really fake,
+     * because real RDMA guarantees that the sender sees the completion only
+     * after the receiver's NIC has confirmed that all the data was received).
+     */
+    if(bml_btl->btl_flags & MCA_BTL_FLAGS_FAKE_RDMA) {
+        if(mca_pml_ob1_send_fin_btl(proc, bml_btl, hdr_des) == OMPI_SUCCESS)
+            return OMPI_SUCCESS;
+    } else {
+        for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager);
+                i++) {
+            bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+            if(mca_pml_ob1_send_fin_btl(proc, bml_btl, hdr_des) == OMPI_SUCCESS)
+                return OMPI_SUCCESS;
+        }
+        bml_btl = NULL;
+    }
-    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des);
+    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl);
     return OMPI_ERR_OUT_OF_RESOURCE;
 }

View file

@@ -330,7 +330,7 @@ static void mca_pml_ob1_rget_completion(
     }
     mca_pml_ob1_send_fin(recvreq->req_recv.req_base.req_proc,
-        frag->rdma_hdr.hdr_rget.hdr_des.pval);
+        frag->rdma_hdr.hdr_rget.hdr_des.pval, bml_btl);
     /* is receive request complete */
     if( OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, frag->rdma_length)

View file

@@ -1018,7 +1018,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
     }
     mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
-        frag->rdma_hdr.hdr_rdma.hdr_des.pval);
+        frag->rdma_hdr.hdr_rdma.hdr_des.pval, bml_btl);
     /* check for request completion */
     if( OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, frag->rdma_length)