fceabb2498
This is a fairly intrusive change, but outside of the moving of opal/event to opal/mca/event, the only changes involved were (a) changing all calls to opal_event functions to reflect the new framework instead, and (b) ensuring that all opal_event_t objects are properly constructed since they are now true opal_objects. Note: Shiqing has just returned from vacation and has not yet had a chance to complete the Windows integration. Thus, this commit almost certainly breaks Windows support on the trunk. However, I want this to have a chance to soak for as long as possible before I become less available a week from today (going to be at a class for 5 days, and thus will only be sparingly available) so we can find and fix any problems. Biggest change is moving the libevent code from opal/event to a new opal/mca/event framework. This was done to make it much easier to update libevent in the future. New versions can be inserted as a new component and tested in parallel with the current version until validated, then we can remove the earlier version if we so choose. This is a statically built framework à la installdirs, so only one component will build at a time. There is no selection logic - the sole compiled component simply loads its function pointers into the opal_event struct. I have gone through the code base and converted all the libevent calls I could find. However, I can neither compile nor test every environment. It is therefore quite likely that errors remain in the system. Please keep an eye open for two things: 1. compile-time errors: these will be obvious as calls to the old functions (e.g., opal_evtimer_new) must be replaced by the new framework APIs (e.g., opal_event.evtimer_new); 2. run-time errors: these will likely show up as segfaults due to missing constructors on opal_event_t objects. It appears that it became a typical practice for people to "init" an opal_event_t by simply using memset to zero it out. 
This will no longer work - you must either OBJ_NEW or OBJ_CONSTRUCT an opal_event_t. I tried to catch these cases, but may have missed some. Believe me, you'll know when you hit it. There is also the issue of the new libevent "no recursion" behavior. As I described on a recent email, we will have to discuss this and figure out what, if anything, we need to do. This commit was SVN r23925.
370 строки
16 KiB
C
370 строки
16 KiB
C
/*
|
|
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "ompi_config.h"
|
|
#include "ompi/constants.h"
|
|
#include "opal/mca/event/event.h"
|
|
#include "opal/util/output.h"
|
|
#include "ompi/mca/btl/btl.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "ompi/runtime/ompi_module_exchange.h"
|
|
|
|
#include "ompi/mca/mpool/base/base.h"
|
|
|
|
#include "btl_elan.h"
|
|
#include "btl_elan_frag.h"
|
|
#include "btl_elan_endpoint.h"
|
|
|
|
#include "ompi/mca/btl/base/base.h"
|
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
|
|
|
#include "elan/elan.h"
|
|
|
|
#include "opal/util/opal_environ.h"
|
|
|
|
#define ELAN_MAX_BTL 10
|
|
|
|
mca_btl_elan_component_t mca_btl_elan_component = {
|
|
{
|
|
/* First, the mca_base_component_t struct containing meta information
|
|
about the component itself */
|
|
|
|
{
|
|
MCA_BTL_BASE_VERSION_2_0_0,
|
|
|
|
"elan", /* MCA component name */
|
|
OMPI_MAJOR_VERSION, /* MCA component major version */
|
|
OMPI_MINOR_VERSION, /* MCA component minor version */
|
|
OMPI_RELEASE_VERSION, /* MCA component release version */
|
|
mca_btl_elan_component_open, /* component open */
|
|
mca_btl_elan_component_close /* component close */
|
|
},
|
|
{
|
|
/* The component is not checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_NONE
|
|
},
|
|
mca_btl_elan_component_init,
|
|
mca_btl_elan_component_progress,
|
|
}
|
|
};
|
|
|
|
|
|
/*
 * Utility routine for parameter registration.
 *
 * Registers an integer parameter named param_name for the "btl" type and
 * the "elan" component (no help string is supplied), then looks up its
 * current value.
 *
 * @param param_name     name of the parameter to register
 * @param default_value  value handed to the registration call
 *
 * @return the value stored by the lookup; default_value when the lookup
 *         does not overwrite it (the lookup status itself is not checked).
 */
static inline int
mca_btl_elan_param_register_int( const char* param_name,
                                 int default_value )
{
    int id = mca_base_param_register_int("btl","elan",param_name,NULL,default_value);
    /* Start from the default so a lookup that stores nothing still
       yields a defined value */
    int param_value = default_value;

    mca_base_param_lookup_int(id,&param_value);
    return param_value;
}
|
|
|
|
/*
|
|
* Called by MCA framework to open the component, registers
|
|
* component parameters.
|
|
*/
|
|
|
|
int mca_btl_elan_component_open(void)
|
|
{
|
|
/* initialize state */
|
|
mca_btl_elan_component.elan_num_btls = 0;
|
|
mca_btl_elan_component.elan_btls = NULL;
|
|
|
|
mca_btl_elan_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
|
|
mca_btl_elan_module.super.btl_eager_limit = 32*1024;
|
|
mca_btl_elan_module.super.btl_rndv_eager_limit = mca_btl_elan_module.super.btl_eager_limit;
|
|
mca_btl_elan_module.super.btl_max_send_size = 64*1024; /*64*1024;*/
|
|
mca_btl_elan_module.super.btl_rdma_pipeline_send_length = 512 * 1024;
|
|
mca_btl_elan_module.super.btl_rdma_pipeline_frag_size = 128 * 1024;
|
|
mca_btl_elan_module.super.btl_min_rdma_pipeline_size = 128 * 1024;
|
|
mca_btl_elan_module.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND;
|
|
mca_btl_elan_module.super.btl_bandwidth = 1959;
|
|
mca_btl_elan_module.super.btl_latency = 4;
|
|
mca_btl_base_param_register(&mca_btl_elan_component.super.btl_version,
|
|
&mca_btl_elan_module.super);
|
|
|
|
mca_base_param_reg_string( (mca_base_component_t*)&mca_btl_elan_component, "elanidmap",
|
|
"System-wide configuration file for the Quadrics network (elanidmap)",
|
|
false, false, "/etc/elanidmap", &mca_btl_elan_component.elanidmap_file );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_elan_component, "max_posted_recv",
|
|
"Number of received posted in advance. Increasing this number for"
|
|
" communication bound application can lead to visible improvement"
|
|
" in performances",
|
|
false, false, 128, &mca_btl_elan_component.elan_max_posted_recv );
|
|
|
|
/* register Elan4 component parameters */
|
|
mca_btl_elan_component.elan_free_list_num =
|
|
mca_btl_elan_param_register_int( "free_list_num", 8 );
|
|
mca_btl_elan_component.elan_free_list_max =
|
|
mca_btl_elan_param_register_int( "free_list_max",
|
|
(mca_btl_elan_component.elan_free_list_num +
|
|
mca_btl_elan_component.elan_max_posted_recv) );
|
|
mca_btl_elan_component.elan_free_list_inc =
|
|
mca_btl_elan_param_register_int( "free_list_inc", 32 );
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* component cleanup - sanity checking of queue lengths
|
|
*/
|
|
|
|
int mca_btl_elan_component_close(void)
|
|
{
|
|
if( NULL != mca_btl_elan_component.elan_btls ) {
|
|
free( mca_btl_elan_component.elan_btls );
|
|
mca_btl_elan_component.elan_btls = NULL;
|
|
mca_btl_elan_component.elan_num_btls = 0;
|
|
|
|
/* release resources */
|
|
OBJ_DESTRUCT(&mca_btl_elan_component.elan_procs);
|
|
OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_eager);
|
|
OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_user);
|
|
OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_max);
|
|
OBJ_DESTRUCT(&mca_btl_elan_component.elan_lock);
|
|
}
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Elan4 component initialization:
|
|
* (1) read interface list from kernel and compare against component parameters
|
|
* then create a BTL instance for selected interfaces
|
|
* (2) setup Elan4 listen socket for incoming connection attempts
|
|
* (3) register BTL parameters with the MCA
|
|
*/
|
|
mca_btl_base_module_t**
|
|
mca_btl_elan_component_init( int *num_btl_modules,
|
|
bool enable_progress_threads,
|
|
bool enable_mpi_threads )
|
|
{
|
|
|
|
mca_btl_base_module_t** btls = NULL;
|
|
|
|
*num_btl_modules = 0;
|
|
|
|
/* There is no support for a progress thread yet. */
|
|
if (enable_progress_threads) {
|
|
ompi_modex_send(&mca_btl_elan_component.super.btl_version, NULL, 0);
|
|
return NULL;
|
|
}
|
|
|
|
OBJ_CONSTRUCT (&mca_btl_elan_component.elan_lock, opal_mutex_t);
|
|
OBJ_CONSTRUCT (&mca_btl_elan_component.elan_frag_eager, ompi_free_list_t);
|
|
OBJ_CONSTRUCT (&mca_btl_elan_component.elan_frag_max, ompi_free_list_t);
|
|
OBJ_CONSTRUCT (&mca_btl_elan_component.elan_frag_user, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_elan_component.elan_procs, opal_list_t);
|
|
ompi_free_list_init_new( &mca_btl_elan_component.elan_frag_eager,
|
|
sizeof(mca_btl_elan_frag_t) + mca_btl_elan_module.super.btl_eager_limit,
|
|
opal_cache_line_size,
|
|
OBJ_CLASS(mca_btl_elan_frag_t),
|
|
0,opal_cache_line_size,
|
|
mca_btl_elan_component.elan_free_list_num,
|
|
mca_btl_elan_component.elan_free_list_max,
|
|
mca_btl_elan_component.elan_free_list_inc,
|
|
NULL ); /* use default allocator */
|
|
|
|
ompi_free_list_init_new( &mca_btl_elan_component.elan_frag_user,
|
|
sizeof(mca_btl_elan_frag_t),
|
|
opal_cache_line_size,
|
|
OBJ_CLASS(mca_btl_elan_frag_t),
|
|
0,opal_cache_line_size,
|
|
mca_btl_elan_component.elan_free_list_num,
|
|
mca_btl_elan_component.elan_free_list_max,
|
|
mca_btl_elan_component.elan_free_list_inc,
|
|
NULL ); /* use default allocator */
|
|
|
|
ompi_free_list_init_new( &mca_btl_elan_component.elan_frag_max,
|
|
sizeof(mca_btl_elan_frag_t)+mca_btl_elan_module.super.btl_max_send_size,
|
|
opal_cache_line_size,
|
|
OBJ_CLASS(mca_btl_elan_frag_t),
|
|
0,opal_cache_line_size,
|
|
mca_btl_elan_component.elan_free_list_num,
|
|
mca_btl_elan_component.elan_free_list_max,
|
|
mca_btl_elan_component.elan_free_list_inc,
|
|
NULL ); /* use default allocator */
|
|
|
|
mca_btl_elan_component.elan_num_btls = ELAN_MAX_BTL; /* no more than that */
|
|
mca_btl_elan_component.elan_btls = calloc( mca_btl_elan_component.elan_num_btls,
|
|
sizeof(mca_btl_base_module_t*) );
|
|
/* Retrieve the positions of the node in the elan network */
|
|
{
|
|
FILE* position;
|
|
char filename[255], file_line[255];
|
|
int index, count = 0, positions[ELAN_MAX_BTL];
|
|
mca_btl_elan_module_t* btl;
|
|
|
|
for( index = 0; index < ELAN_MAX_BTL; index++ ) {
|
|
snprintf( filename, 255, "/proc/qsnet/elan4/device%d/position", index );
|
|
position = fopen( filename, "r" );
|
|
if( NULL == position ) {
|
|
continue;
|
|
}
|
|
if( 0 == fscanf( position, "%s%i", file_line, &positions[count] ) ) {
|
|
opal_output( 0, "Unable to read the network position" );
|
|
continue;
|
|
}
|
|
fclose(position);
|
|
btl = (mca_btl_elan_module_t*)malloc (sizeof (mca_btl_elan_module_t));
|
|
if(NULL == btl) {
|
|
opal_output( 0, "No enough memory to allocate the Elan internal structures" );
|
|
return NULL;
|
|
}
|
|
memcpy( btl, &mca_btl_elan_module, sizeof(mca_btl_elan_module_t) );
|
|
OBJ_CONSTRUCT( &btl->elan_lock, opal_mutex_t );
|
|
OBJ_CONSTRUCT( &btl->send_list, opal_list_t );
|
|
OBJ_CONSTRUCT( &btl->rdma_list, opal_list_t );
|
|
OBJ_CONSTRUCT( &btl->recv_list, opal_list_t );
|
|
|
|
btl->expect_tport_recv = 1;
|
|
btl->elan_position = positions[count];
|
|
|
|
mca_btl_elan_component.elan_btls[count++] = btl;
|
|
}
|
|
mca_btl_elan_component.elan_num_btls = count;
|
|
/* Publish the network positions for the current node */
|
|
ompi_modex_send( &mca_btl_elan_component.super.btl_version, positions,
|
|
count * sizeof(int));
|
|
}
|
|
|
|
if(mca_btl_elan_component.elan_num_btls) {
|
|
btls = (mca_btl_base_module_t**)malloc( mca_btl_elan_component.elan_num_btls *
|
|
sizeof(mca_btl_base_module_t*) );
|
|
if( NULL == btls ) {
|
|
free( mca_btl_elan_component.elan_btls );
|
|
mca_btl_elan_component.elan_num_btls = 0; /* no active BTL modules */
|
|
return NULL;
|
|
}
|
|
memcpy( btls, mca_btl_elan_component.elan_btls,
|
|
mca_btl_elan_component.elan_num_btls * sizeof(mca_btl_elan_module_t*) );
|
|
}
|
|
*num_btl_modules = mca_btl_elan_component.elan_num_btls;
|
|
return btls;
|
|
}
|
|
|
|
/*
|
|
* Elan4 component progress.
|
|
*/
|
|
int mca_btl_elan_component_progress( void )
|
|
{
|
|
int num_progressed = 0, i;
|
|
|
|
for( i = 0; i < (int)mca_btl_elan_component.elan_num_btls; i++ ) {
|
|
mca_btl_elan_module_t* elan_btl = mca_btl_elan_component.elan_btls[i];
|
|
|
|
/* This is a fast receive over the queue */
|
|
if( elan_queueRxPoll( elan_btl->rx_queue, 0 ) ) {
|
|
mca_btl_active_message_callback_t* reg;
|
|
mca_btl_elan_hdr_t* elan_hdr = NULL;
|
|
mca_btl_elan_frag_t frag;
|
|
|
|
elan_hdr = (mca_btl_elan_hdr_t*)elan_queueRxWait( elan_btl->rx_queue, NULL, 0 );
|
|
frag.base.des_dst = &frag.segment;
|
|
frag.base.des_dst->seg_addr.pval = (void*)(elan_hdr+1);
|
|
frag.base.des_dst->seg_len = (size_t)elan_hdr->length;
|
|
frag.base.des_dst_cnt = 1;
|
|
frag.tag = (mca_btl_base_tag_t)elan_hdr->tag;
|
|
frag.size = elan_hdr->length;
|
|
|
|
reg = mca_btl_base_active_message_trigger + frag.tag;
|
|
reg->cbfunc( &(elan_btl->super), frag.tag, &(frag.base), reg->cbdata );
|
|
elan_queueRxComplete( elan_btl->rx_queue );
|
|
num_progressed++;
|
|
}
|
|
/* This is the slower receive over the tport */
|
|
if(elan_btl->expect_tport_recv && !OPAL_THREAD_TRYLOCK(&elan_btl->elan_lock)) {
|
|
mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->recv_list) );
|
|
if( elan_done(frag->elan_event, 0) ) {
|
|
int tag;
|
|
size_t length;
|
|
mca_btl_active_message_callback_t* reg;
|
|
void* recv_buf;
|
|
recv_buf = (mca_btl_elan_hdr_t*)elan_tportRxWait( frag->elan_event,
|
|
NULL, &tag, &length );
|
|
num_progressed++;
|
|
/*elan_btl->expect_tport_recv--;*/
|
|
|
|
opal_list_remove_first( &(elan_btl->recv_list) );
|
|
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
|
|
|
frag->base.des_dst->seg_addr.pval = (void*)recv_buf;
|
|
frag->base.des_dst->seg_len = length;
|
|
frag->tag = (mca_btl_base_tag_t)tag;
|
|
reg = mca_btl_base_active_message_trigger + frag->tag;
|
|
reg->cbfunc( &(elan_btl->super), frag->tag, &(frag->base), reg->cbdata );
|
|
if( recv_buf != (void*)(frag+1) ) {
|
|
elan_tportBufFree( elan_btl->tport, recv_buf );
|
|
frag->base.des_dst->seg_addr.pval = (void*)(frag+1);
|
|
}
|
|
|
|
frag->elan_event = elan_tportRxStart( elan_btl->tport,
|
|
ELAN_TPORT_RXBUF | ELAN_TPORT_RXANY,
|
|
0, 0, 0, 0,
|
|
frag->base.des_dst->seg_addr.pval,
|
|
mca_btl_elan_module.super.btl_eager_limit );
|
|
OPAL_THREAD_LOCK(&elan_btl->elan_lock);
|
|
opal_list_append( &(elan_btl->recv_list), (opal_list_item_t*)frag );
|
|
}
|
|
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
|
}
|
|
/* If there are any pending sends check their completion */
|
|
recheck_send_list:
|
|
if( !opal_list_is_empty( &(elan_btl->send_list) ) && !OPAL_THREAD_TRYLOCK(&elan_btl->elan_lock) ) {
|
|
mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->send_list) );
|
|
if( (NULL != frag) && elan_poll(frag->elan_event, 0) ) {
|
|
int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
|
|
|
|
opal_list_remove_first( &(elan_btl->send_list) );
|
|
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
|
num_progressed++;
|
|
|
|
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint,
|
|
&(frag->base), OMPI_SUCCESS );
|
|
if( btl_ownership ) {
|
|
MCA_BTL_ELAN_FRAG_RETURN(frag);
|
|
}
|
|
goto recheck_send_list;
|
|
} else {
|
|
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
|
}
|
|
}
|
|
recheck_rdma_list:
|
|
/* If any RDMA have been posted, check their status */
|
|
if( !opal_list_is_empty( &(elan_btl->rdma_list) ) && !OPAL_THREAD_TRYLOCK(&elan_btl->elan_lock) ) {
|
|
mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->rdma_list) );
|
|
if( (NULL != frag) && elan_poll(frag->elan_event, 0) ) {
|
|
int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
|
|
|
|
opal_list_remove_first( &(elan_btl->rdma_list) );
|
|
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
|
num_progressed++;
|
|
|
|
frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint,
|
|
&(frag->base), OMPI_SUCCESS );
|
|
if( btl_ownership ) {
|
|
MCA_BTL_ELAN_FRAG_RETURN(frag);
|
|
}
|
|
goto recheck_rdma_list;
|
|
} else {
|
|
OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
|
|
}
|
|
}
|
|
}
|
|
|
|
return num_progressed;
|
|
}
|