
The latest and greatest Elan improvements.

This commit was SVN r17361.
This commit is contained in:
George Bosilca 2008-02-01 21:29:57 +00:00
parent 982acaa2c9
Commit 3a6d2e3894
3 changed files with 172 additions and 84 deletions

View file

@@ -81,19 +81,19 @@ static int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
mca_btl_base_error_no_nics( "ELAN", "Quadrics" );
return OMPI_ERR_OUT_OF_RESOURCE;
}
elan_btl->base = base;
/* Create the global queue */
if( (elan_btl->global_queue = elan_gallocQueue(base, base->allGroup)) == NULL ) {
elan_btl->base = base;
elan_btl->elan_vp = base->state->vp;
/* Create the tport global queue */
if( (elan_btl->tport_queue = elan_gallocQueue(base, base->allGroup)) == NULL ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* Create and initialize the tport */
if( !(elan_btl->tport = elan_tportInit(base->state,
elan_btl->global_queue,
elan_btl->tport_queue,
mca_btl_elan_component.elan_max_posted_recv,
base->tport_smallmsg,
mca_btl_elan_module.super.btl_rndv_eager_limit,
mca_btl_elan_module.super.btl_eager_limit,
base->tport_stripemsg,
ELAN_POLL_EVENT,
base->retryCount,
@@ -105,11 +105,17 @@ static int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
}
/* Create the receive queue */
if( (elan_btl->global_queue = elan_gallocQueue(base, base->allGroup)) == NULL ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
mca_btl_elan_component.queue_max_size = elan_queueMaxSlotSize( base->state )
- sizeof(mca_btl_elan_hdr_t);
elan_btl->rx_queue =
elan_queueRxInit( base->state, /* ELAN_STATE *state */
elan_btl->global_queue, /* ELAN_QUEUE *queue */
mca_btl_elan_component.elan_max_posted_recv, /* int nSlots */
mca_btl_elan_module.super.btl_eager_limit, /* int slotSize */
(int)mca_btl_elan_component.queue_max_size, /* int slotSize */
ELAN_RAIL_ALL, /* int rail */
(ELAN_TPORT_SHM_DISABLE |
ELAN_TPORT_USERCOPY_DISABLE) /* ELAN_FLAGS flags */);
@@ -152,6 +158,27 @@ static int mca_btl_elan_add_procs( struct mca_btl_base_module_t* btl,
peers[i] = elan_endpoint;
}
for( i = 0; i < mca_btl_elan_component.elan_max_posted_recv; i++ ) {
mca_btl_elan_frag_t* frag;
MCA_BTL_ELAN_FRAG_ALLOC_EAGER(frag, rc );
if( NULL == frag ) {
return OMPI_ERROR;
}
frag->segment.seg_addr.pval = (void*)(frag + 1);
frag->base.des_dst = &(frag->segment);
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->type = MCA_BTL_ELAN_HDR_TYPE_RECV;
frag->elan_event = elan_tportRxStart( elan_btl->tport,
ELAN_TPORT_RXBUF | ELAN_TPORT_RXANY,
0, 0, 0, 0,
frag->segment.seg_addr.pval,
mca_btl_elan_module.super.btl_eager_limit );
opal_list_append( &(elan_btl->recv_list), (opal_list_item_t*)frag );
}
/* enable the network */
elan_enable_network( elan_btl->base->state );
@@ -191,24 +218,26 @@ mca_btl_elan_alloc( struct mca_btl_base_module_t* btl,
size_t size,
uint32_t flags )
{
mca_btl_elan_frag_t* frag;
mca_btl_elan_frag_t* frag = NULL;
ptrdiff_t hdr_skip = 0;
int rc;
if( size <= btl->btl_eager_limit ) {
MCA_BTL_ELAN_FRAG_ALLOC_EAGER(frag, rc);
if( OPAL_UNLIKELY(NULL == frag) ) {
if( NULL == frag ) {
return NULL;
}
if( size <= mca_btl_elan_component.queue_max_size ) { /* This will be going over the queue */
hdr_skip = sizeof(mca_btl_elan_hdr_t);
}
} else if( size <= btl->btl_max_send_size ) {
MCA_BTL_ELAN_FRAG_ALLOC_MAX(frag, rc);
if( OPAL_UNLIKELY(NULL == frag) ) {
return NULL;
}
} else {
MCA_BTL_ELAN_FRAG_ALLOC_MAX(frag, rc);
}
if( OPAL_UNLIKELY(NULL == frag) ) {
return NULL;
}
frag->segment.seg_addr.pval = (void*)((char*)(frag + 1)
+ sizeof(mca_btl_elan_hdr_t));
frag->segment.seg_addr.pval = (void*)((char*)(frag + 1) + hdr_skip);
frag->segment.seg_len = size;
frag->base.des_src = &(frag->segment);
frag->base.des_src_cnt = 1;
@@ -262,49 +291,43 @@ mca_btl_elan_prepare_src( struct mca_btl_base_module_t* btl,
mca_btl_elan_frag_t* frag;
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data = *size;
size_t max_data = *size, skip = 0;
int rc;
if( OPAL_UNLIKELY(max_data > UINT32_MAX) ) {
max_data = (size_t)UINT32_MAX;
}
if( max_data+reserve <= btl->btl_eager_limit ) {
MCA_BTL_ELAN_FRAG_ALLOC_EAGER(frag, rc);
if( NULL == frag ) {
return NULL;
if( 0 != reserve ) {
if( max_data + reserve <= btl->btl_eager_limit ) {
MCA_BTL_ELAN_FRAG_ALLOC_EAGER(frag, rc);
if( NULL == frag ) {
return NULL;
}
if( (max_data + reserve) <= mca_btl_elan_component.queue_max_size ) {
skip = sizeof(mca_btl_elan_hdr_t);
}
} else {
MCA_BTL_ELAN_FRAG_ALLOC_MAX(frag, rc);
if( NULL == frag ) {
return NULL;
}
if( (max_data + reserve) > btl->btl_max_send_size ) {
max_data = btl->btl_max_send_size - reserve;
}
}
frag->segment.seg_addr.pval = (void*)((unsigned char*)(frag + 1) + skip);
iov.iov_len = max_data;
iov.iov_base = (void*)((unsigned char*) frag->segment.seg_addr.pval + reserve);
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );
*size = max_data;
if( rc < 0 ) {
MCA_BTL_ELAN_FRAG_RETURN(frag);
return NULL;
}
frag->segment.seg_addr.pval = frag+1;
frag->segment.seg_len = max_data + reserve;
}
else if( max_data+reserve <= btl->btl_max_send_size ) {
MCA_BTL_ELAN_FRAG_ALLOC_MAX(frag, rc);
if( NULL == frag ) {
return NULL;
}
if(max_data + reserve > btl->btl_max_send_size){
max_data = btl->btl_max_send_size - reserve;
}
iov.iov_len = max_data;
iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve;
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );
*size = max_data;
if( rc < 0 ) {
MCA_BTL_ELAN_FRAG_RETURN(frag);
return NULL;
}
frag->segment.seg_addr.pval = frag+1;
*size = max_data;
frag->segment.seg_len = max_data + reserve;
} else {
} else { /* this is a real RDMA operation */
MCA_BTL_ELAN_FRAG_ALLOC_USER(frag, rc);
if(NULL == frag) {
return NULL;
@@ -317,6 +340,7 @@ mca_btl_elan_prepare_src( struct mca_btl_base_module_t* btl,
frag->segment.seg_addr.pval = iov.iov_base;
frag->segment.seg_len = max_data;
}
frag->base.des_src = &(frag->segment);
frag->base.des_src_cnt = 1;
frag->base.order = MCA_BTL_NO_ORDER;
@@ -352,20 +376,24 @@ mca_btl_elan_prepare_dst( struct mca_btl_base_module_t* btl,
uint32_t flags )
{
mca_btl_elan_frag_t* frag;
size_t origin, position = *size;
int rc;
if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) {
*size = (size_t)UINT32_MAX;
}
MCA_BTL_ELAN_FRAG_ALLOC_USER(frag, rc);
if( NULL == frag ) {
return NULL;
}
ompi_convertor_get_current_pointer( convertor, (void**)&(frag->segment.seg_addr.pval) );
origin = convertor->bConverted;
position += origin;
ompi_convertor_set_position( convertor, &position );
*size = position - origin;
frag->segment.seg_len = *size;
frag->segment.seg_key.key64 = (uint64_t)(intptr_t)convertor;
/*frag->segment.seg_addr.pval = convertor->pBaseBuf + convertor->bConverted;*/
frag->type = MCA_BTL_ELAN_HDR_TYPE_PUT;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->base.des_flags = 0;
@@ -401,23 +429,34 @@ static int mca_btl_elan_send( struct mca_btl_base_module_t* btl,
frag->tag = tag;
frag->type = MCA_BTL_ELAN_HDR_TYPE_SEND;
elan_hdr->tag = (int)tag;
elan_hdr->length = (int)frag->segment.seg_len;
send_len = frag->segment.seg_len + sizeof(mca_btl_elan_hdr_t);
frag->elan_event = elan_queueTx( elan_btl->tx_queue,
endpoint->elan_vp,
(void*)elan_hdr,
send_len, ELAN_RAIL_ALL );
if( OPAL_UNLIKELY(NULL == frag->elan_event) ) {
opal_output( 0, "elan_queueTx failed for destination %d\n", endpoint->elan_vp );
return OMPI_ERROR;
if( frag->segment.seg_len <= mca_btl_elan_component.queue_max_size ) {
elan_hdr->tag = (int)tag;
elan_hdr->length = (int)frag->segment.seg_len;
send_len = frag->segment.seg_len + sizeof(mca_btl_elan_hdr_t);
frag->elan_event = elan_queueTx( elan_btl->tx_queue,
endpoint->elan_vp,
(void*)elan_hdr,
send_len, ELAN_RAIL_ALL );
if( OPAL_UNLIKELY(NULL == frag->elan_event) ) {
opal_output( 0, "elan_queueTx failed for destination %d\n", endpoint->elan_vp );
return OMPI_ERROR;
}
if( elan_poll( frag->elan_event, 0 ) ) {
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint,
&(frag->base), OMPI_SUCCESS );
return OMPI_SUCCESS;
}
} else {
frag->elan_event = elan_tportTxStart( elan_btl->tport, 0, endpoint->elan_vp,
elan_btl->elan_vp, frag->tag,
(void*)elan_hdr, frag->segment.seg_len );
if( elan_tportTxDone(frag->elan_event) ) {
elan_tportTxWait(frag->elan_event);
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint,
&(frag->base), OMPI_SUCCESS );
return OMPI_SUCCESS;
}
}
if( elan_poll( frag->elan_event, 0 ) ) {
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint,
&(frag->base), OMPI_SUCCESS );
return OMPI_SUCCESS;
}
/* Add the fragment to the pending send list */
opal_list_append( &(elan_btl->send_list), (opal_list_item_t*)frag );
return OMPI_SUCCESS;
@@ -469,16 +508,18 @@ static int mca_btl_elan_get( mca_btl_base_module_t* btl,
{
mca_btl_elan_module_t* elan_btl = (mca_btl_elan_module_t*) btl;
mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*) des;
int peer = endpoint->elan_vp;
mca_btl_base_segment_t* src = des->des_src;
mca_btl_base_segment_t* dst = des->des_dst;
unsigned char* src_addr = (unsigned char*)src->seg_addr.pval;
size_t src_len = src->seg_len;
unsigned char* dst_addr = (unsigned char*)ompi_ptr_ltop(dst->seg_addr.lval);
unsigned char* dst_addr = (unsigned char*)dst->seg_addr.lval;
frag->endpoint = endpoint;
frag->btl = elan_btl;
frag->type = MCA_BTL_ELAN_HDR_TYPE_GET;
opal_output( 0, "elan_get( remote %p, local %p, length %d, peer %d )\n",
(void*)src_addr, (void*)dst_addr, (int)src_len, peer );
frag->elan_event = elan_get(elan_btl->base->state, src_addr, dst_addr, src_len, peer);
/* Add the fragment to the pending RDMA list */
opal_list_append( &(elan_btl->rdma_list), (opal_list_item_t*)frag );
@@ -505,10 +546,13 @@ int mca_btl_elan_finalize( struct mca_btl_base_module_t* btl )
}
mca_btl_elan_component.elan_num_btls--;
/* Cancel all posted receives */
/* Release the internal structures */
OBJ_DESTRUCT(&elan_btl->elan_lock);
OBJ_DESTRUCT(&elan_btl->recv_list);
OBJ_DESTRUCT(&elan_btl->send_list);
OBJ_DESTRUCT(&elan_btl->rdma_list);
OBJ_DESTRUCT(&elan_btl->elan_lock);
/* The BTL is clean, remove it */
free(elan_btl);

View file

@@ -43,8 +43,8 @@ BEGIN_C_DECLS
struct mca_btl_elan_component_t {
mca_btl_base_component_1_0_1_t super; /**< base BTL component */
uint32_t ib_max_btls;
/**< maximum number of hcas available to the ELAN component */
size_t queue_max_size;
/**< maximum amount of data transferred using the queues */
uint32_t elan_num_btls;
/**< number of hcas available to the ELAN component */
@@ -87,14 +87,18 @@ OMPI_MODULE_DECLSPEC extern mca_btl_elan_component_t mca_btl_elan_component;
struct mca_btl_elan_module_t {
mca_btl_base_module_t super; /**< base BTL interface */
int expect_tport_recv;
int elan_vp;
ELAN_BASE* base;
ELAN_TPORT* tport;
ELAN_QUEUE* global_queue; /**< The global queue */
ELAN_QUEUE* tport_queue;
ELAN_QUEUE_RX* rx_queue; /**< The local receive queue */
ELAN_QUEUE_TX* tx_queue; /**< The global send queue */
opal_mutex_t elan_lock;
opal_list_t send_list; /**< list of posted sends */
opal_list_t rdma_list; /**< list of posted RDMA operations */
opal_list_t recv_list;
};
typedef struct mca_btl_elan_module_t mca_btl_elan_module_t;
extern mca_btl_elan_module_t mca_btl_elan_module;

View file

@@ -90,28 +90,19 @@ int mca_btl_elan_component_open(void)
mca_btl_elan_component.elan_num_btls = 0;
mca_btl_elan_component.elan_btls = NULL;
mca_btl_elan_module.super.btl_exclusivity = 0;
mca_btl_elan_module.super.btl_eager_limit = 2*1024 - sizeof(mca_btl_elan_hdr_t);
mca_btl_elan_module.super.btl_rndv_eager_limit = 32*1024 - sizeof(mca_btl_elan_hdr_t);
mca_btl_elan_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
mca_btl_elan_module.super.btl_eager_limit = 32*1024;
mca_btl_elan_module.super.btl_rndv_eager_limit = mca_btl_elan_module.super.btl_eager_limit;
mca_btl_elan_module.super.btl_max_send_size = 64*1024; /*64*1024;*/
mca_btl_elan_module.super.btl_rdma_pipeline_send_length = 512 * 1024;
mca_btl_elan_module.super.btl_rdma_pipeline_frag_size = 128 * 1024;
mca_btl_elan_module.super.btl_min_rdma_pipeline_size = 128 * 1024;
mca_btl_elan_module.super.btl_flags = /* MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_GET |*/ MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND;
/* mca_btl_elan_module.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE|MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND ;*/
mca_btl_elan_module.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND;
mca_btl_elan_module.super.btl_bandwidth = 1959;
mca_btl_elan_module.super.btl_latency = 4;
mca_btl_base_param_register(&mca_btl_elan_component.super.btl_version,
&mca_btl_elan_module.super);
/* register Elan4 component parameters */
mca_btl_elan_component.elan_free_list_num =
mca_btl_elan_param_register_int ("free_list_num", 8);
mca_btl_elan_component.elan_free_list_max =
mca_btl_elan_param_register_int ("free_list_max", 128);
mca_btl_elan_component.elan_free_list_inc =
mca_btl_elan_param_register_int ("free_list_inc", 32);
mca_base_param_reg_string( (mca_base_component_t*)&mca_btl_elan_component, "elanidmap",
"System-wide configuration file for the Quadrics network (elanidmap)",
false, false, "/etc/elanidmap", &mca_btl_elan_component.elanidmap_file );
@@ -121,6 +112,16 @@ int mca_btl_elan_component_open(void)
" in performances",
false, false, 128, &mca_btl_elan_component.elan_max_posted_recv );
/* register Elan4 component parameters */
mca_btl_elan_component.elan_free_list_num =
mca_btl_elan_param_register_int( "free_list_num", 8 );
mca_btl_elan_component.elan_free_list_max =
mca_btl_elan_param_register_int( "free_list_max",
(mca_btl_elan_component.elan_free_list_num +
mca_btl_elan_component.elan_max_posted_recv) );
mca_btl_elan_component.elan_free_list_inc =
mca_btl_elan_param_register_int( "free_list_inc", 32 );
return OMPI_SUCCESS;
}
@@ -238,6 +239,9 @@ mca_btl_elan_component_init( int *num_btl_modules,
OBJ_CONSTRUCT( &btl->elan_lock, opal_mutex_t );
OBJ_CONSTRUCT( &btl->send_list, opal_list_t );
OBJ_CONSTRUCT( &btl->rdma_list, opal_list_t );
OBJ_CONSTRUCT( &btl->recv_list, opal_list_t );
btl->expect_tport_recv = 1;
mca_btl_elan_component.elan_btls[count++] = btl;
}
@@ -284,6 +288,42 @@ int mca_btl_elan_component_progress( void )
elan_queueRxComplete( elan_btl->rx_queue );
num_progressed++;
}
if(elan_btl->expect_tport_recv) { /* There is a pending message on the tport */
mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->recv_list) );
if( elan_done(frag->elan_event, 0) ) {
int tag;
size_t length;
mca_btl_active_message_callback_t* reg;
void* recv_buf;
recv_buf = (mca_btl_elan_hdr_t*)elan_tportRxWait( frag->elan_event,
NULL, &tag, &length );
num_progressed++;
/*elan_btl->expect_tport_recv--;*/
OPAL_THREAD_LOCK(&elan_btl->elan_lock);
opal_list_remove_first( &(elan_btl->recv_list) );
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
frag->base.des_dst->seg_addr.pval = (void*)recv_buf;
frag->base.des_dst->seg_len = length;
frag->tag = (mca_btl_base_tag_t)tag;
reg = mca_btl_base_active_message_trigger + frag->tag;
reg->cbfunc( &(elan_btl->super), frag->tag, &(frag->base), reg->cbdata );
if( recv_buf != (void*)(frag+1) ) {
elan_tportBufFree( elan_btl->tport, recv_buf );
frag->base.des_dst->seg_addr.pval = (void*)(frag+1);
}
OPAL_THREAD_LOCK(&elan_btl->elan_lock);
frag->elan_event = elan_tportRxStart( elan_btl->tport,
ELAN_TPORT_RXBUF | ELAN_TPORT_RXANY,
0, 0, 0, 0,
frag->base.des_dst->seg_addr.pval,
mca_btl_elan_module.super.btl_eager_limit );
opal_list_append( &(elan_btl->recv_list), (opal_list_item_t*)frag );
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
}
}
/* If there are any pending sends check their completion */
if( !opal_list_is_empty( &(elan_btl->send_list) ) ) {
mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->send_list) );