1
1

Port of the short-message RDMA support from the openib BTL. The endpoint registers a circular buffer and sends its address and rkey to the peer. The peer uses this buffer to eagerly RDMA small messages into it. The endpoint polls the buffer for message arrival before checking the HP/LP QPs. Set btl_mvapi_use_eager_rdma to 1 to enable it.

This commit was SVN r9474.
Этот коммит содержится в:
Gleb Natapov 2006-03-30 12:55:31 +00:00
родитель 99ba9bea10
Коммит ea11582191
7 изменённых файлов: 461 добавлений и 103 удалений

Просмотреть файл

@ -32,7 +32,8 @@ sources = \
btl_mvapi_frag.c \
btl_mvapi_frag.h \
btl_mvapi_proc.c \
btl_mvapi_proc.h
btl_mvapi_proc.h \
btl_mvapi_eager_rdma.h
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la

Просмотреть файл

@ -28,6 +28,7 @@
/* Open MPI includes */
#include "ompi/class/ompi_free_list.h"
#include "ompi/class/ompi_bitmap.h"
#include "orte/class/orte_pointer_array.h"
#include "opal/event/event.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
@ -122,7 +123,10 @@ struct mca_btl_mvapi_component_t {
uint32_t ib_service_level;
uint32_t ib_static_rate;
uint32_t ib_src_path_bits;
uint32_t use_eager_rdma;
uint32_t eager_rdma_threashold;
uint32_t eager_rdma_num;
uint32_t max_eager_rdma;
}; typedef struct mca_btl_mvapi_component_t mca_btl_mvapi_component_t;
extern mca_btl_mvapi_component_t mca_btl_mvapi_component;
@ -182,7 +186,10 @@ struct mca_btl_mvapi_module_t {
opal_list_t pending_frags_hp; /**< list of pending high priority frags */
opal_list_t pending_frags_lp; /**< list of pending low priority frags */
opal_mutex_t eager_rdma_lock;
size_t eager_rdma_frag_size; /**< length of eager frag */
orte_pointer_array_t *eager_rdma_buffers; /**< RDMA buffers to poll */
uint32_t eager_rdma_buffers_count; /**< number of RDMA buffers */
}; typedef struct mca_btl_mvapi_module_t mca_btl_mvapi_module_t;

Просмотреть файл

@ -40,6 +40,7 @@
#include "btl_mvapi.h"
#include "btl_mvapi_frag.h"
#include "btl_mvapi_endpoint.h"
#include "btl_mvapi_eager_rdma.h"
#include "ompi/mca/btl/base/base.h"
#include <vapi.h>
#include <vapi_common.h>
@ -197,6 +198,17 @@ int mca_btl_mvapi_component_open(void)
mca_btl_mvapi_param_register_int("srq_sd_max", "Maximum number of send descriptors posted per process",
8, &mca_btl_mvapi_component.srq_sd_max);
mca_btl_mvapi_param_register_int("use_eager_rdma", "user RDMA for eager messages",
0, &mca_btl_mvapi_component.use_eager_rdma);
if (mca_btl_mvapi_component.use_srq)
mca_btl_mvapi_component.use_eager_rdma = 0;
mca_btl_mvapi_param_register_int("eager_rdma_threashold", "Open rdma channel for eager messages after this number of messages received from peer (zero to disable)",
100, &mca_btl_mvapi_component.eager_rdma_threashold);
mca_btl_mvapi_param_register_int("max_eager_rdma", "Maximum number of eager RDMA connections",
16, (int*)&mca_btl_mvapi_component.max_eager_rdma);
mca_btl_mvapi_param_register_int("eager_rdma_num", "Number of RDMA buffers for eager messages",
16, (int*)&mca_btl_mvapi_component.eager_rdma_num);
mca_btl_mvapi_component.eager_rdma_num+=1;
mca_btl_mvapi_param_register_int ("exclusivity", "BTL exclusivity",
MCA_BTL_EXCLUSIVITY_DEFAULT, (int*) &mca_btl_mvapi_module.super.btl_exclusivity);
mca_btl_mvapi_param_register_int ("eager_limit", "eager send limit",
@ -276,11 +288,37 @@ static void mca_btl_mvapi_control(
/* dont return credits used for control messages */
mca_btl_mvapi_frag_t* frag = (mca_btl_mvapi_frag_t*)descriptor;
mca_btl_mvapi_endpoint_t* endpoint = frag->endpoint;
mca_btl_mvapi_control_header_t *ctl_hdr = frag->segment.seg_addr.pval;
mca_btl_mvapi_eager_rdma_header_t *rdma_hdr;
if(frag->size == mca_btl_mvapi_component.eager_limit) {
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -1);
/* if not sent via rdma */
if (!MCA_BTL_MVAPI_RDMA_FRAG(frag) &&
ctl_hdr->type == MCA_BTL_MVAPI_CONTROL_NOOP)
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -1);
} else {
OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -1);
}
switch (ctl_hdr->type) {
case MCA_BTL_MVAPI_CONTROL_NOOP:
break;
case MCA_BTL_MVAPI_CONTROL_RDMA:
rdma_hdr = (mca_btl_mvapi_eager_rdma_header_t*)ctl_hdr;
if (endpoint->eager_rdma_remote.base.pval) {
BTL_ERROR(("Got RDMA connect twise!\n"));
return;
}
endpoint->eager_rdma_remote.rkey = rdma_hdr->rkey;
endpoint->eager_rdma_remote.base.pval = rdma_hdr->rdma_start.pval;
endpoint->eager_rdma_remote.tokens =
mca_btl_mvapi_component.eager_rdma_num - 1;
break;
default:
BTL_ERROR(("Unknown message type sent by BTL\n"));
break;
}
}
@ -467,9 +505,13 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
length = sizeof(mca_btl_mvapi_frag_t) +
sizeof(mca_btl_mvapi_header_t) +
sizeof(mca_btl_mvapi_footer_t) +
mvapi_btl->super.btl_eager_limit+
2*MCA_BTL_IB_FRAG_ALIGN;
mvapi_btl->eager_rdma_frag_size =
length & ~(2 * MCA_BTL_IB_FRAG_ALIGN - 1);
ompi_free_list_init(&mvapi_btl->send_free_eager,
length,
OBJ_CLASS(mca_btl_mvapi_send_frag_eager_t),
@ -530,7 +572,13 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
mca_btl_mvapi_component.ib_free_list_inc,
mvapi_btl->super.btl_mpool);
orte_pointer_array_init(&mvapi_btl->eager_rdma_buffers,
mca_btl_mvapi_component.max_eager_rdma,
mca_btl_mvapi_component.max_eager_rdma,
0);
mvapi_btl->eager_rdma_buffers_count = 0;
OBJ_CONSTRUCT(&mvapi_btl->eager_rdma_lock, opal_mutex_t);
/* Initialize the rr_desc_post array for posting of rr*/
mvapi_btl->rr_desc_post = (VAPI_rr_desc_t*) malloc(
((mca_btl_mvapi_component.rd_num + mca_btl_mvapi_component.rd_rsv) * sizeof(VAPI_rr_desc_t)));
@ -546,6 +594,102 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
return btls;
}
/*
 * Process one incoming high-priority message, delivered either over the
 * regular HP QP or via the peer's eager-RDMA write into our ring buffer.
 * Dispatches to the registered tag callback, recycles the fragment,
 * returns piggybacked flow-control credits and reposts receive resources.
 *
 * @param mvapi_btl BTL module the message arrived on
 * @param endpoint  peer endpoint that sent the message
 * @param frag      received fragment; frag->hdr (and ftr for RDMA frags)
 *                  already set up by the caller
 * @param byte_len  number of bytes received, measured from frag->hdr
 * @return OMPI_SUCCESS
 */
int mca_btl_mvapi_handle_incoming_hp(
mca_btl_mvapi_module_t *mvapi_btl,
mca_btl_mvapi_endpoint_t *endpoint,
mca_btl_mvapi_frag_t *frag,
size_t byte_len)
{
/* advance the segment address past the header and subtract from the length..*/
frag->segment.seg_len = byte_len-
((unsigned char*) frag->segment.seg_addr.pval -
(unsigned char*) frag->hdr);
/* call registered callback */
mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag,
&frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata);
if (!MCA_BTL_MVAPI_RDMA_FRAG(frag)) {
/* ordinary QP receive: hand the fragment back to the free list */
OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager),
(opal_list_item_t*) frag);
} else {
/* eager-RDMA receive: mark this ring slot free (REMOTE) and advance
 * the tail over every contiguous freed slot, granting one local RDMA
 * credit per slot reclaimed */
mca_btl_mvapi_frag_t *tf;
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
MCA_BTL_MVAPI_RDMA_MAKE_REMOTE(frag->ftr);
while (endpoint->eager_rdma_local.tail !=
endpoint->eager_rdma_local.head) {
tf = MCA_BTL_MVAPI_GET_LOCAL_RDMA_FRAG(endpoint,
endpoint->eager_rdma_local.tail);
if (MCA_BTL_MVAPI_RDMA_FRAG_LOCAL (tf))
break;
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits, 1);
MCA_BTL_MVAPI_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.tail);
}
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
}
/* without SRQ the sender piggybacks send and RDMA credits in the header */
if (!mca_btl_mvapi_component.use_srq) {
OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp, frag->hdr->credits);
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,
frag->hdr->rdma_credits);
}
/* once enough eager messages have arrived from this peer, establish an
 * eager-RDMA ring for it (bounded by max_eager_rdma connections) */
if (mca_btl_mvapi_component.use_eager_rdma &&
!endpoint->eager_rdma_local.base.pval &&
mvapi_btl->eager_rdma_buffers_count <
mca_btl_mvapi_component.max_eager_rdma &&
OPAL_THREAD_ADD32(&endpoint->eager_recv_count, 1) ==
mca_btl_mvapi_component.eager_rdma_threashold)
mca_btl_mvapi_endpoint_connect_eager_rdma(endpoint);
/* repost receive descriptors */
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
OPAL_THREAD_ADD32((int32_t*) &mvapi_btl->srd_posted_hp, -1);
MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 0);
} else {
#endif
/* eager-RDMA slots are not posted receives, so only account for
 * fragments that actually consumed an HP receive descriptor */
if (!MCA_BTL_MVAPI_RDMA_FRAG(frag)) {
OPAL_THREAD_ADD32((int32_t*) &endpoint->rd_posted_hp, -1);
MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 0);
}
/* check to see if we need to progress any pending desciptors */
if(endpoint->sd_tokens_hp > 0 ||
endpoint->eager_rdma_remote.tokens > 0) {
while(!opal_list_is_empty(&endpoint->pending_frags_hp) &&
endpoint->sd_wqe_hp > 0 &&
(endpoint->sd_tokens_hp > 0 ||
endpoint->eager_rdma_remote.tokens > 0)) {
opal_list_item_t *frag_item;
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_frags_hp));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_mvapi_frag_t *) frag_item))
break;
if(OMPI_SUCCESS !=
mca_btl_mvapi_endpoint_send(frag->endpoint, frag)) {
BTL_ERROR(("error in posting pending send\n"));
break;
}
}
}
/* check to see if we need to return credits */
if((endpoint->rd_credits_hp >= mca_btl_mvapi_component.rd_win ||
endpoint->eager_rdma_local.credits >=
mca_btl_mvapi_component.rd_win) &&
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, 1) == 1) {
mca_btl_mvapi_endpoint_send_credits_hp(endpoint);
}
#ifdef VAPI_FEATURE_SRQ
}
#endif
return OMPI_SUCCESS;
}
/*
* IB component progress.
*/
@ -553,10 +697,10 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
int mca_btl_mvapi_component_progress( void )
{
uint32_t i;
int count = 0;
uint32_t i, j, c;
int count = 0, ret;
int32_t credits;
mca_btl_mvapi_frag_t* frag;
mca_btl_mvapi_endpoint_t* endpoint;
/* Poll for completions */
@ -564,7 +708,52 @@ int mca_btl_mvapi_component_progress( void )
VAPI_ret_t ret;
VAPI_wc_desc_t comp;
mca_btl_mvapi_module_t* mvapi_btl = &mca_btl_mvapi_component.mvapi_btls[i];
OPAL_THREAD_LOCK(&mvapi_btl->eager_rdma_lock);
c = mvapi_btl->eager_rdma_buffers_count;
OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock);
for(j = 0; j < c; j++) {
endpoint =
orte_pointer_array_get_item(mvapi_btl->eager_rdma_buffers, j);
if(!endpoint) /* shouldn't happen */
continue;
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
frag = MCA_BTL_MVAPI_GET_LOCAL_RDMA_FRAG (endpoint,
endpoint->eager_rdma_local.head);
if (MCA_BTL_MVAPI_RDMA_FRAG_LOCAL (frag)) {
uint32_t size = MCA_BTL_MVAPI_RDMA_FRAG_GET_SIZE(frag->ftr);
#if OMPI_ENABLE_DEBUG
if (frag->ftr->seq != endpoint->eager_rdma_local.seq)
BTL_ERROR(("Eager RDMA wrong SEQ: received %d expected %d",
frag->ftr->seq,
endpoint->eager_rdma_local.seq));
endpoint->eager_rdma_local.seq++;
#endif
MCA_BTL_MVAPI_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.head);
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
frag->hdr = (mca_btl_mvapi_header_t*)(((char*)frag->ftr) -
size + sizeof(mca_btl_mvapi_footer_t));
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) +
sizeof(mca_btl_mvapi_header_t);
ret = mca_btl_mvapi_handle_incoming_hp(mvapi_btl,
frag->endpoint, frag,
size - sizeof(mca_btl_mvapi_footer_t));
if (ret != MPI_SUCCESS)
return ret;
count++;
} else
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
}
if(count)
break;
/* we have two completion queues, one for "high" priority and one for "low".
* we will check the high priority and process them until there are none left.
* note that low priority messages are only processed one per progress call.
@ -584,6 +773,7 @@ int mca_btl_mvapi_component_progress( void )
BTL_ERROR(("Got an RDMA with Immediate data!, not supported!"));
return OMPI_ERROR;
case VAPI_CQE_SQ_RDMA_WRITE:
case VAPI_CQE_SQ_SEND_DATA :
/* Process a completed send */
@ -598,7 +788,7 @@ int mca_btl_mvapi_component_progress( void )
/* check to see if we need to progress any pending desciptors */
while (!opal_list_is_empty(&endpoint->pending_frags_hp) &&
endpoint->sd_wqe_hp > 0 && endpoint->sd_tokens_hp > 0) {
endpoint->sd_wqe_hp > 0 && (endpoint->sd_tokens_hp > 0 || endpoint->eager_rdma_remote.tokens > 0)) {
opal_list_item_t *frag_item;
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_frags_hp));
@ -613,7 +803,7 @@ int mca_btl_mvapi_component_progress( void )
if(!mca_btl_mvapi_component.use_srq) {
/* check to see if we need to return credits */
if( endpoint->rd_credits_hp >= mca_btl_mvapi_component.rd_win &&
if((endpoint->rd_credits_hp >= mca_btl_mvapi_component.rd_win || endpoint->eager_rdma_local.credits >= mca_btl_mvapi_component.rd_win) &&
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, 1) == 1) {
mca_btl_mvapi_endpoint_send_credits_hp(endpoint);
}
@ -633,58 +823,16 @@ int mca_btl_mvapi_component_progress( void )
break;
case VAPI_CQE_RQ_SEND_DATA:
/* process a RECV */
/* process a RECV */
frag = (mca_btl_mvapi_frag_t*) (unsigned long) comp.id;
endpoint = (mca_btl_mvapi_endpoint_t*) frag->endpoint;
credits = frag->hdr->credits;
/* advance the segment address past the header and subtract from the length..*/
frag->segment.seg_len = comp.byte_len-((unsigned char*) frag->segment.seg_addr.pval - (unsigned char*) frag->hdr);
/* call registered callback */
mvapi_btl->ib_reg[frag->hdr->tag].cbfunc(&mvapi_btl->super, frag->hdr->tag, &frag->base, mvapi_btl->ib_reg[frag->hdr->tag].cbdata);
OMPI_FREE_LIST_RETURN(&(mvapi_btl->recv_free_eager), (opal_list_item_t*) frag);
/* repost receive descriptors */
#ifdef VAPI_FEATURE_SRQ
if(mca_btl_mvapi_component.use_srq) {
OPAL_THREAD_ADD32((int32_t*) &mvapi_btl->srd_posted_hp, -1);
MCA_BTL_MVAPI_POST_SRR_HIGH(mvapi_btl, 0);
} else {
#endif
OPAL_THREAD_ADD32((int32_t*) &endpoint->rd_posted_hp, -1);
MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, 0);
/* check to see if we need to progress any pending desciptors */
if( OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp, credits) > 0) {
while(!opal_list_is_empty(&endpoint->pending_frags_hp) &&
endpoint->sd_wqe_hp > 0 && endpoint->sd_tokens_hp > 0) {
opal_list_item_t *frag_item;
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_frags_hp));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_mvapi_frag_t *) frag_item))
break;
if(OMPI_SUCCESS != mca_btl_mvapi_endpoint_send(frag->endpoint, frag)) {
BTL_ERROR(("error in posting pending send\n"));
break;
}
}
}
/* check to see if we need to return credits */
if( endpoint->rd_credits_hp >= mca_btl_mvapi_component.rd_win &&
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp, 1) == 1) {
mca_btl_mvapi_endpoint_send_credits_hp(endpoint);
}
#ifdef VAPI_FEATURE_SRQ
}
#endif
ret = mca_btl_mvapi_handle_incoming_hp(mvapi_btl,
frag->endpoint, frag, comp.byte_len);
if (ret != OMPI_SUCCESS)
return ret;
count++;
break;
case VAPI_CQE_SQ_RDMA_READ:
case VAPI_CQE_SQ_RDMA_WRITE:
default:
BTL_ERROR(("Unhandled work completion opcode is %d", comp.opcode));
break;

Просмотреть файл

@ -63,47 +63,52 @@ static inline int mca_btl_mvapi_endpoint_post_send(
mca_btl_mvapi_module_t* mvapi_btl,
mca_btl_mvapi_endpoint_t * endpoint,
mca_btl_mvapi_frag_t * frag)
{
{
int do_rdma = 0;
VAPI_qp_hndl_t qp_hndl;
int ret;
if(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY && frag->size <= mvapi_btl->super.btl_eager_limit){
if(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY &&
frag->size <= mvapi_btl->super.btl_eager_limit){
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
opal_list_append(&endpoint->pending_frags_hp, (opal_list_item_t *)frag);
return OMPI_SUCCESS;
/* check for a token */
} else if(!mca_btl_mvapi_component.use_srq &&
OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,1);
opal_list_append(&endpoint->pending_frags_hp, (opal_list_item_t *)frag);
return OMPI_SUCCESS;
} else if( mca_btl_mvapi_component.use_srq &&
OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_hp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_hp,1);
OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);
opal_list_append(&mvapi_btl->pending_frags_hp, (opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);
return OMPI_SUCCESS;
/* queue the request */
} else {
frag->hdr->credits = (endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp : 0;
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
qp_hndl = endpoint->lcl_qp_hndl_hp;
}
/* check for rdma tocken */
if (OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,1);
/* check for a token */
if(!mca_btl_mvapi_component.use_srq &&
OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
OPAL_THREAD_ADD32(&endpoint->sd_tokens_hp,1);
opal_list_append(&endpoint->pending_frags_hp,
(opal_list_item_t *)frag);
return OMPI_SUCCESS;
} else if( mca_btl_mvapi_component.use_srq &&
OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_hp,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe_hp,1);
OPAL_THREAD_ADD32(&mvapi_btl->sd_tokens_hp,1);
OPAL_THREAD_LOCK(&mvapi_btl->ib_lock);
opal_list_append(&mvapi_btl->pending_frags_hp, (opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&mvapi_btl->ib_lock);
return OMPI_SUCCESS;
}
} else {
do_rdma = 1;
}
frag->hdr->credits =
(endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp : 0;
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
-frag->hdr->rdma_credits);
qp_hndl = endpoint->lcl_qp_hndl_hp;
} else {
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe_lp,-1) < 0) {
@ -138,10 +143,37 @@ static inline int mca_btl_mvapi_endpoint_post_send(
}
}
frag->sr_desc.opcode = VAPI_SEND;
frag->sr_desc.remote_qkey = 0;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr;
frag->sg_entry.len = frag->segment.seg_len + sizeof(mca_btl_mvapi_header_t);
frag->sg_entry.len =
frag->segment.seg_len + sizeof(mca_btl_mvapi_header_t) +
(do_rdma ? sizeof(mca_btl_mvapi_footer_t) : 0);
if(do_rdma) {
mca_btl_mvapi_footer_t* ftr =
(mca_btl_mvapi_footer_t*)(((char*)frag->segment.seg_addr.pval) +
frag->segment.seg_len);
frag->sr_desc.opcode = VAPI_RDMA_WRITE;
MCA_BTL_MVAPI_RDMA_FRAG_SET_SIZE(ftr, frag->sg_entry.len);
MCA_BTL_MVAPI_RDMA_MAKE_LOCAL(ftr);
#ifdef OMPI_ENABLE_DEBUG
ftr->seq = endpoint->eager_rdma_remote.seq++;
#endif
frag->sr_desc.r_key = (VAPI_rkey_t)endpoint->eager_rdma_remote.rkey;
frag->sr_desc.remote_addr = (VAPI_virt_addr_t)
endpoint->eager_rdma_remote.base.lval +
endpoint->eager_rdma_remote.head *
mvapi_btl->eager_rdma_frag_size +
sizeof(mca_btl_mvapi_frag_t) +
sizeof(mca_btl_mvapi_header_t) +
frag->size +
sizeof(mca_btl_mvapi_footer_t);
frag->sr_desc.remote_addr -= frag->sg_entry.len;
MCA_BTL_MVAPI_RDMA_NEXT_INDEX (endpoint->eager_rdma_remote.head);
} else {
frag->sr_desc.opcode = VAPI_SEND;
}
if(frag->sg_entry.len <= mvapi_btl->ib_inline_max) {
ret = EVAPI_post_inline_sr(mvapi_btl->nic, qp_hndl, &frag->sr_desc);
@ -209,6 +241,14 @@ static void mca_btl_mvapi_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
endpoint->sd_tokens_lp = mca_btl_mvapi_component.rd_num;
endpoint->get_tokens = mca_btl_mvapi_component.ib_qp_ous_rd_atom;
/* initialize RDMA eager related parts */
endpoint->eager_recv_count = 0;
memset(&endpoint->eager_rdma_remote, 0,
sizeof(mca_btl_mvapi_eager_rdma_remote_t));
memset (&endpoint->eager_rdma_local, 0,
sizeof(mca_btl_mvapi_eager_rdma_local_t));
OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t);
endpoint->rem_info.rem_qp_num_hp = 0;
endpoint->rem_info.rem_qp_num_lp = 0;
endpoint->rem_info.rem_lid = 0;
@ -1009,10 +1049,12 @@ void mca_btl_mvapi_endpoint_send_credits_lp(
frag->hdr->tag = MCA_BTL_TAG_BTL;
frag->hdr->credits = endpoint->rd_credits_lp;
OPAL_THREAD_ADD32(&endpoint->rd_credits_lp, -frag->hdr->credits);
((mca_btl_mvapi_control_header_t *)frag->segment.seg_addr.pval)->type = MCA_BTL_MVAPI_CONTROL_NOOP;
frag->sr_desc.opcode = VAPI_SEND;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr;
frag->sg_entry.len = sizeof(mca_btl_mvapi_header_t);
frag->sg_entry.len = sizeof(mca_btl_mvapi_header_t) +
sizeof(mca_btl_mvapi_control_header_t);
if(sizeof(mca_btl_mvapi_header_t) <= mvapi_btl->ib_inline_max) {
ret = EVAPI_post_inline_sr(mvapi_btl->nic, endpoint->lcl_qp_hndl_lp, &frag->sr_desc);
@ -1046,7 +1088,9 @@ static void mca_btl_mvapi_endpoint_credits_hp(
/* check to see if there are addditional credits to return */
if ((credits = OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-1)) > 0) {
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,-credits);
if (endpoint->rd_credits_hp >= mca_btl_mvapi_component.rd_win &&
if ((endpoint->rd_credits_hp >= mca_btl_mvapi_component.rd_win ||
endpoint->eager_rdma_local.credits >=
mca_btl_mvapi_component.rd_win) &&
OPAL_THREAD_ADD32(&endpoint->sd_credits_hp,1) == 1) {
mca_btl_mvapi_endpoint_send_credits_hp(endpoint);
}
@ -1076,12 +1120,19 @@ void mca_btl_mvapi_endpoint_send_credits_hp(
frag->endpoint = endpoint;
frag->hdr->tag = MCA_BTL_TAG_BTL;
frag->hdr->credits = endpoint->rd_credits_hp;
frag->hdr->credits =
(endpoint->rd_credits_hp > 0) ? endpoint->rd_credits_hp: 0;
OPAL_THREAD_ADD32(&endpoint->rd_credits_hp, -frag->hdr->credits);
frag->hdr->rdma_credits = endpoint->eager_rdma_local.credits;
OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
-frag->hdr->rdma_credits);
((mca_btl_mvapi_control_header_t *)frag->segment.seg_addr.pval)->type = MCA_BTL_MVAPI_CONTROL_NOOP;
frag->sr_desc.opcode = VAPI_SEND;
frag->sg_entry.addr = (VAPI_virt_addr_t) (MT_virt_addr_t) frag->hdr;
frag->sg_entry.len = sizeof(mca_btl_mvapi_header_t);
frag->sg_entry.len = sizeof(mca_btl_mvapi_header_t) +
sizeof(mca_btl_mvapi_control_header_t);
if(sizeof(mca_btl_mvapi_header_t) <= mvapi_btl->ib_inline_max) {
ret = EVAPI_post_inline_sr(mvapi_btl->nic, endpoint->lcl_qp_hndl_hp, &frag->sr_desc);
@ -1097,3 +1148,102 @@ void mca_btl_mvapi_endpoint_send_credits_hp(
}
}
/*
 * Send-completion callback for the eager-RDMA connect control message:
 * simply return the fragment to the eager free list.
 */
static void mca_btl_mvapi_endpoint_eager_rdma(
    mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    struct mca_btl_base_descriptor_t* descriptor,
    int status)
{
    mca_btl_mvapi_module_t *mvapi_btl = (mca_btl_mvapi_module_t*) btl;
    mca_btl_mvapi_frag_t *frag = (mca_btl_mvapi_frag_t*) descriptor;

    MCA_BTL_IB_FRAG_RETURN_EAGER(mvapi_btl, frag);
}
/*
 * Send the local eager-RDMA ring description (rkey + base address) to the
 * peer as a MCA_BTL_MVAPI_CONTROL_RDMA control message so the peer can
 * start RDMA-writing eager messages into our ring.
 *
 * @param endpoint peer to notify
 * @return 0 on success, -1 on allocation or send failure
 */
static int mca_btl_mvapi_endpoint_send_eager_rdma(
    mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_mvapi_module_t* mvapi_btl = endpoint->endpoint_btl;
    mca_btl_mvapi_eager_rdma_header_t *rdma_hdr;
    mca_btl_mvapi_frag_t* frag;
    int rc;

    MCA_BTL_IB_FRAG_ALLOC_EAGER(mvapi_btl, frag, rc);
    if(NULL == frag) {
        BTL_ERROR(("error allocating fragment"));
        return -1;
    }

    frag->base.des_cbfunc = mca_btl_mvapi_endpoint_eager_rdma;
    frag->base.des_cbdata = NULL;
    frag->endpoint = endpoint;
    frag->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY;
    frag->hdr->tag = MCA_BTL_TAG_BTL;
    /* fill in the control payload describing our ring */
    rdma_hdr = (mca_btl_mvapi_eager_rdma_header_t*)frag->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_MVAPI_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->r_key;
    rdma_hdr->rdma_start.pval = endpoint->eager_rdma_local.base.pval;
    frag->segment.seg_len = sizeof(mca_btl_mvapi_eager_rdma_header_t);
    if (mca_btl_mvapi_endpoint_post_send(mvapi_btl, endpoint, frag) !=
            OMPI_SUCCESS) {
        MCA_BTL_IB_FRAG_RETURN_EAGER(mvapi_btl, frag);
        /* was missing the %s conversion, so errno text never printed */
        BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
        return -1;
    }
    return 0;
}
/*
 * Create and register the local RDMA ring buffer for eager messages from
 * this peer, publish it in the BTL's buffer array, and tell the peer about
 * it. On any failure the ring is torn down and the endpoint is left
 * without an eager-RDMA connection (eager_rdma_local.base.pval == NULL).
 *
 * @param endpoint peer endpoint to set up the eager-RDMA ring for
 */
void mca_btl_mvapi_endpoint_connect_eager_rdma(
    mca_btl_mvapi_endpoint_t* endpoint)
{
    mca_btl_mvapi_module_t* mvapi_btl = endpoint->endpoint_btl;
    char *buf;
    int i;

    OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
    /* another thread may already have set the ring up */
    if (endpoint->eager_rdma_local.base.pval)
        goto unlock_rdma_local;

    buf = mvapi_btl->super.btl_mpool->mpool_alloc(mvapi_btl->super.btl_mpool,
            mvapi_btl->eager_rdma_frag_size *
            mca_btl_mvapi_component.eager_rdma_num, 0, 0,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);
    if(!buf)
        goto unlock_rdma_local;

    /* carve the registered region into eager-RDMA receive fragments */
    for(i = 0; i < mca_btl_mvapi_component.eager_rdma_num; i++) {
        ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf +
                i*mvapi_btl->eager_rdma_frag_size);
        item->user_data = endpoint->eager_rdma_local.reg;
        OBJ_CONSTRUCT(item, mca_btl_mvapi_recv_frag_eager_t);
        ((mca_btl_mvapi_frag_t*)item)->endpoint = endpoint;
        ((mca_btl_mvapi_frag_t*)item)->type = MCA_BTL_MVAPI_FRAG_EAGER_RDMA;
    }

    OPAL_THREAD_LOCK(&mvapi_btl->eager_rdma_lock);
    if(orte_pointer_array_add (&endpoint->eager_rdma_index,
                mvapi_btl->eager_rdma_buffers, endpoint) < 0)
        goto cleanup;
    endpoint->eager_rdma_local.base.pval = buf;
    mvapi_btl->eager_rdma_buffers_count++;
    if (mca_btl_mvapi_endpoint_send_eager_rdma(endpoint) == 0) {
        OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock);
        OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
        return;
    }
    /* sending the ring description failed -- roll back the publication */
    mvapi_btl->eager_rdma_buffers_count--;
    endpoint->eager_rdma_local.base.pval = NULL;
    orte_pointer_array_set_item(mvapi_btl->eager_rdma_buffers,
            endpoint->eager_rdma_index, NULL);
cleanup:
    OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock);
    /* bugfix: previously freed via an uninitialized local pointer
     * (eager_rdma->reg); the registration lives in the endpoint */
    mvapi_btl->super.btl_mpool->mpool_free(mvapi_btl->super.btl_mpool,
            buf,
            (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
unlock_rdma_local:
    OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
}

Просмотреть файл

@ -25,6 +25,8 @@
#include "ompi/mca/btl/btl.h"
#include "btl_mvapi_frag.h"
#include "btl_mvapi.h"
#include "btl_mvapi_eager_rdma.h"
#include "ompi/mca/mpool/mvapi/mpool_mvapi.h"
#include <vapi.h>
#include <mtl_common.h>
@ -148,9 +150,13 @@ struct mca_btl_base_endpoint_t {
int32_t sd_wqe_lp; /**< number of available low priority send wqe entries */
uint32_t subnet;
#if 0
mca_btl_mvapi_rdma_buf_t *rdma_buf;
#endif
uint32_t eager_recv_count; /**< number of eager received */
mca_btl_mvapi_eager_rdma_remote_t eager_rdma_remote;
/**< info about remote RDMA buffer */
mca_btl_mvapi_eager_rdma_local_t eager_rdma_local;
/**< info about local RDMA buffer */
size_t eager_rdma_index; /**< index into RDMA buffers pointer array */
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@ -160,7 +166,7 @@ int mca_btl_mvapi_endpoint_connect(mca_btl_base_endpoint_t*);
void mca_btl_mvapi_endpoint_send_credits_hp(mca_btl_base_endpoint_t*);
void mca_btl_mvapi_endpoint_send_credits_lp(mca_btl_base_endpoint_t*);
void mca_btl_mvapi_post_recv(void);
void mca_btl_mvapi_endpoint_connect_eager_rdma(mca_btl_mvapi_endpoint_t*);
#define MCA_BTL_MVAPI_ENDPOINT_POST_RR_HIGH(endpoint, \
additional) \

Просмотреть файл

@ -78,7 +78,8 @@ static void mca_btl_mvapi_recv_frag_common_constructor(mca_btl_mvapi_frag_t* fra
static void mca_btl_mvapi_send_frag_eager_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.eager_limit;
frag->size = mca_btl_mvapi_component.eager_limit;
frag->type = MCA_BTL_MVAPI_FRAG_EAGER;
mca_btl_mvapi_send_frag_common_constructor(frag);
}
@ -86,13 +87,15 @@ static void mca_btl_mvapi_send_frag_eager_constructor(mca_btl_mvapi_frag_t* frag
static void mca_btl_mvapi_send_frag_max_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.max_send_size;
frag->size = mca_btl_mvapi_component.max_send_size;
frag->type = MCA_BTL_MVAPI_FRAG_MAX;
mca_btl_mvapi_send_frag_common_constructor(frag);
}
static void mca_btl_mvapi_recv_frag_max_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.max_send_size;
frag->size = mca_btl_mvapi_component.max_send_size;
frag->type = MCA_BTL_MVAPI_FRAG_MAX;
mca_btl_mvapi_recv_frag_common_constructor(frag);
}
@ -101,14 +104,18 @@ static void mca_btl_mvapi_recv_frag_max_constructor(mca_btl_mvapi_frag_t* frag)
static void mca_btl_mvapi_recv_frag_eager_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = mca_btl_mvapi_component.eager_limit;
frag->type = MCA_BTL_MVAPI_FRAG_EAGER;
mca_btl_mvapi_recv_frag_common_constructor(frag);
frag->ftr = (mca_btl_mvapi_footer_t*)((char*)frag->segment.seg_addr.pval
+ frag->size);
MCA_BTL_MVAPI_RDMA_MAKE_REMOTE(frag->ftr);
}
static void mca_btl_mvapi_send_frag_frag_constructor(mca_btl_mvapi_frag_t* frag)
{
frag->size = 0;
frag->type = MCA_BTL_MVAPI_FRAG_FRAG;
mca_btl_mvapi_send_frag_common_constructor(frag);
}

Просмотреть файл

@ -37,9 +37,46 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_mvapi_frag_t);
/** Per-message header prepended to every send fragment. */
struct mca_btl_mvapi_header_t {
mca_btl_base_tag_t tag;          /**< dispatch tag for the receiver's callback table */
int16_t credits;                 /**< piggybacked HP receive-descriptor credits */
int16_t rdma_credits;            /**< piggybacked eager-RDMA ring-slot credits */
};
typedef struct mca_btl_mvapi_header_t mca_btl_mvapi_header_t;
/** Trailer written at the end of an eager-RDMA fragment; the receiver
 *  polls its last byte to detect message arrival. */
struct mca_btl_mvapi_footer_t {
#ifdef OMPI_ENABLE_DEBUG
uint32_t seq;                    /**< per-connection sequence number (debug builds only) */
#endif
union {
uint32_t size;                   /**< total fragment size written by the sender */
uint8_t buf[4];                  /**< byte view of size, used for polling/marking */
} u;
};
typedef struct mca_btl_mvapi_footer_t mca_btl_mvapi_footer_t;
/** Types of BTL-internal control messages (tag MCA_BTL_TAG_BTL). */
typedef enum {
MCA_BTL_MVAPI_CONTROL_NOOP,
MCA_BTL_MVAPI_CONTROL_RDMA
} mca_btl_mvapi_control_t;
/** Common prefix of every control-message payload. */
struct mca_btl_mvapi_control_header_t {
mca_btl_mvapi_control_t type;
};
typedef struct mca_btl_mvapi_control_header_t mca_btl_mvapi_control_header_t;
/** Payload of the CONTROL_RDMA message: describes the sender's local
 *  eager-RDMA ring so the peer can RDMA-write into it. */
struct mca_btl_mvapi_eager_rdma_header_t {
mca_btl_mvapi_control_header_t control;
ompi_ptr_t rdma_start;           /**< base address of the ring buffer */
uint64_t rkey;                   /**< remote key for RDMA access to the ring */
};
typedef struct mca_btl_mvapi_eager_rdma_header_t mca_btl_mvapi_eager_rdma_header_t;
/** Fragment provenance, used to decide how a fragment is recycled. */
enum mca_btl_mvapi_frag_type_t {
MCA_BTL_MVAPI_FRAG_EAGER,
MCA_BTL_MVAPI_FRAG_MAX,
MCA_BTL_MVAPI_FRAG_FRAG,
MCA_BTL_MVAPI_FRAG_EAGER_RDMA
};
typedef enum mca_btl_mvapi_frag_type_t mca_btl_mvapi_frag_type_t;
/**
* IB send fragment derived type.
@ -50,13 +87,15 @@ struct mca_btl_mvapi_frag_t {
struct mca_btl_base_endpoint_t *endpoint;
size_t size;
int rc;
mca_btl_mvapi_frag_type_t type;
union{
VAPI_rr_desc_t rr_desc;
VAPI_sr_desc_t sr_desc;
};
VAPI_sg_lst_entry_t sg_entry;
mca_btl_mvapi_header_t *hdr;
mca_btl_mvapi_footer_t *ftr;
mca_mpool_mvapi_registration_t * vapi_reg;
};
typedef struct mca_btl_mvapi_frag_t mca_btl_mvapi_frag_t;