/*
 * Copyright (c) 2004-2008 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "ompi/mca/btl/btl.h"

#include "opal/mca/base/mca_base_param.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/mca/mpool/base/base.h"
#include "btl_elan.h"
#include "btl_elan_frag.h"
#include "btl_elan_endpoint.h"

#include "ompi/mca/btl/base/base.h"
#include "ompi/mca/btl/base/btl_base_error.h"

#include "elan/elan.h"

#include "opal/util/opal_environ.h"

/* Upper bound on the number of Elan devices probed (and thus BTL modules
 * created) by the component initialization. */
#define ELAN_MAX_BTL 10

/*
 * Component descriptor handed to the MCA framework: version/name information
 * followed by the open, close, init and progress entry points.
 */
mca_btl_elan_component_t mca_btl_elan_component = {
    {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */
        {
            MCA_BTL_BASE_VERSION_2_0_0,

            "elan",                         /* MCA component name */
            OMPI_MAJOR_VERSION,             /* MCA component major version */
            OMPI_MINOR_VERSION,             /* MCA component minor version */
            OMPI_RELEASE_VERSION,           /* MCA component release version */
            mca_btl_elan_component_open,    /* component open */
            mca_btl_elan_component_close    /* component close */
        },
        {
            /* The component is not checkpoint ready */
            MCA_BASE_METADATA_PARAM_NONE
        },

        mca_btl_elan_component_init,
        mca_btl_elan_component_progress,
    }
};

/*
 * utility routines for parameter registration
 */

/*
 * Register the integer MCA parameter "btl_elan_<param_name>" with the given
 * default and return its current value.
 *
 * param_name     name of the parameter, without the "btl"/"elan" prefix
 * default_value  value registered as the default; also returned unchanged
 *                when the lookup does not store anything
 */
static inline int
mca_btl_elan_param_register_int( const char* param_name,
                                 int default_value )
{
    int id = mca_base_param_register_int("btl","elan",param_name,NULL,default_value);
    int param_value = default_value;

    /* The lookup writes through the pointer, so it needs the address of the
     * local and not its value. */
    mca_base_param_lookup_int(id,&param_value);
    return param_value;
}

/*
 * Called by MCA framework to open the component, registers
 * component parameters.
*/ int mca_btl_elan_component_open(void) { /* initialize state */ mca_btl_elan_component.elan_num_btls = 0; mca_btl_elan_component.elan_btls = NULL; mca_btl_elan_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT; mca_btl_elan_module.super.btl_eager_limit = 32*1024; mca_btl_elan_module.super.btl_rndv_eager_limit = mca_btl_elan_module.super.btl_eager_limit; mca_btl_elan_module.super.btl_max_send_size = 64*1024; /*64*1024;*/ mca_btl_elan_module.super.btl_rdma_pipeline_send_length = 512 * 1024; mca_btl_elan_module.super.btl_rdma_pipeline_frag_size = 128 * 1024; mca_btl_elan_module.super.btl_min_rdma_pipeline_size = 128 * 1024; mca_btl_elan_module.super.btl_flags = MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND; mca_btl_elan_module.super.btl_bandwidth = 1959; mca_btl_elan_module.super.btl_latency = 4; mca_btl_base_param_register(&mca_btl_elan_component.super.btl_version, &mca_btl_elan_module.super); mca_base_param_reg_string( (mca_base_component_t*)&mca_btl_elan_component, "elanidmap", "System-wide configuration file for the Quadrics network (elanidmap)", false, false, "/etc/elanidmap", &mca_btl_elan_component.elanidmap_file ); mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_elan_component, "max_posted_recv", "Number of received posted in advance. 
Increasing this number for" " communication bound application can lead to visible improvement" " in performances", false, false, 128, &mca_btl_elan_component.elan_max_posted_recv ); /* register Elan4 component parameters */ mca_btl_elan_component.elan_free_list_num = mca_btl_elan_param_register_int( "free_list_num", 8 ); mca_btl_elan_component.elan_free_list_max = mca_btl_elan_param_register_int( "free_list_max", (mca_btl_elan_component.elan_free_list_num + mca_btl_elan_component.elan_max_posted_recv) ); mca_btl_elan_component.elan_free_list_inc = mca_btl_elan_param_register_int( "free_list_inc", 32 ); return OMPI_SUCCESS; } /* * component cleanup - sanity checking of queue lengths */ int mca_btl_elan_component_close(void) { if( NULL != mca_btl_elan_component.elan_btls ) { free( mca_btl_elan_component.elan_btls ); mca_btl_elan_component.elan_btls = NULL; mca_btl_elan_component.elan_num_btls = 0; /* release resources */ OBJ_DESTRUCT(&mca_btl_elan_component.elan_procs); OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_eager); OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_user); OBJ_DESTRUCT(&mca_btl_elan_component.elan_frag_max); OBJ_DESTRUCT(&mca_btl_elan_component.elan_lock); } return OMPI_SUCCESS; } /* * Elan4 component initialization: * (1) read interface list from kernel and compare against component parameters * then create a BTL instance for selected interfaces * (2) setup Elan4 listen socket for incoming connection attempts * (3) register BTL parameters with the MCA */ mca_btl_base_module_t** mca_btl_elan_component_init( int *num_btl_modules, bool enable_progress_threads, bool enable_mpi_threads ) { mca_btl_base_module_t** btls = NULL; *num_btl_modules = 0; /* There is no support for a progress thread yet. 
*/ if (enable_progress_threads) { ompi_modex_send(&mca_btl_elan_component.super.btl_version, NULL, 0); return NULL; } OBJ_CONSTRUCT (&mca_btl_elan_component.elan_lock, opal_mutex_t); OBJ_CONSTRUCT (&mca_btl_elan_component.elan_frag_eager, ompi_free_list_t); OBJ_CONSTRUCT (&mca_btl_elan_component.elan_frag_max, ompi_free_list_t); OBJ_CONSTRUCT (&mca_btl_elan_component.elan_frag_user, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_elan_component.elan_procs, opal_list_t); ompi_free_list_init_new( &mca_btl_elan_component.elan_frag_eager, sizeof(mca_btl_elan_frag_t) + mca_btl_elan_module.super.btl_eager_limit, opal_cache_line_size, OBJ_CLASS(mca_btl_elan_frag_t), 0,opal_cache_line_size, mca_btl_elan_component.elan_free_list_num, mca_btl_elan_component.elan_free_list_max, mca_btl_elan_component.elan_free_list_inc, NULL ); /* use default allocator */ ompi_free_list_init_new( &mca_btl_elan_component.elan_frag_user, sizeof(mca_btl_elan_frag_t), opal_cache_line_size, OBJ_CLASS(mca_btl_elan_frag_t), 0,opal_cache_line_size, mca_btl_elan_component.elan_free_list_num, mca_btl_elan_component.elan_free_list_max, mca_btl_elan_component.elan_free_list_inc, NULL ); /* use default allocator */ ompi_free_list_init_new( &mca_btl_elan_component.elan_frag_max, sizeof(mca_btl_elan_frag_t)+mca_btl_elan_module.super.btl_max_send_size, opal_cache_line_size, OBJ_CLASS(mca_btl_elan_frag_t), 0,opal_cache_line_size, mca_btl_elan_component.elan_free_list_num, mca_btl_elan_component.elan_free_list_max, mca_btl_elan_component.elan_free_list_inc, NULL ); /* use default allocator */ mca_btl_elan_component.elan_num_btls = ELAN_MAX_BTL; /* no more than that */ mca_btl_elan_component.elan_btls = calloc( mca_btl_elan_component.elan_num_btls, sizeof(mca_btl_base_module_t*) ); /* Retrieve the positions of the node in the elan network */ { FILE* position; char filename[255], file_line[255]; int index, count = 0, positions[ELAN_MAX_BTL]; mca_btl_elan_module_t* btl; for( index = 0; index < ELAN_MAX_BTL; index++ ) { 
snprintf( filename, 255, "/proc/qsnet/elan4/device%d/position", index ); position = fopen( filename, "r" ); if( NULL == position ) { continue; } if( 0 == fscanf( position, "%s%i", file_line, &positions[count] ) ) { opal_output( 0, "Unable to read the network position" ); continue; } fclose(position); btl = (mca_btl_elan_module_t*)malloc (sizeof (mca_btl_elan_module_t)); if(NULL == btl) { opal_output( 0, "No enough memory to allocate the Elan internal structures" ); return NULL; } memcpy( btl, &mca_btl_elan_module, sizeof(mca_btl_elan_module_t) ); OBJ_CONSTRUCT( &btl->elan_lock, opal_mutex_t ); OBJ_CONSTRUCT( &btl->send_list, opal_list_t ); OBJ_CONSTRUCT( &btl->rdma_list, opal_list_t ); OBJ_CONSTRUCT( &btl->recv_list, opal_list_t ); btl->expect_tport_recv = 1; btl->elan_position = positions[count]; mca_btl_elan_component.elan_btls[count++] = btl; } mca_btl_elan_component.elan_num_btls = count; /* Publish the network positions for the current node */ ompi_modex_send( &mca_btl_elan_component.super.btl_version, positions, count * sizeof(int)); } if(mca_btl_elan_component.elan_num_btls) { btls = (mca_btl_base_module_t**)malloc( mca_btl_elan_component.elan_num_btls * sizeof(mca_btl_base_module_t*) ); if( NULL == btls ) { free( mca_btl_elan_component.elan_btls ); mca_btl_elan_component.elan_num_btls = 0; /* no active BTL modules */ return NULL; } memcpy( btls, mca_btl_elan_component.elan_btls, mca_btl_elan_component.elan_num_btls * sizeof(mca_btl_elan_module_t*) ); } *num_btl_modules = mca_btl_elan_component.elan_num_btls; return btls; } /* * Elan4 component progress. 
*/
/*
 * Walks over the elan_num_btls modules stored in the component and, for each
 * of them, polls four sources in order: the receive queue, the first posted
 * tport receive, the head of the send list and the head of the rdma list.
 *
 * Returns the number of events handled during this call (one per delivered
 * message and one per completed send or RDMA fragment).
 *
 * The three list based sections are only entered when
 * OPAL_THREAD_TRYLOCK on the module lock evaluates to zero (presumably
 * meaning the lock was obtained -- confirm against the OPAL threading
 * macros); otherwise the section is skipped for this call. The lock is
 * always dropped before a callback is invoked.
 */
int mca_btl_elan_component_progress( void )
{
    int num_progressed = 0, i;

    for( i = 0; i < (int)mca_btl_elan_component.elan_num_btls; i++ ) {
        mca_btl_elan_module_t* elan_btl = mca_btl_elan_component.elan_btls[i];

        /* This is a fast receive over the queue */
        if( elan_queueRxPoll( elan_btl->rx_queue, 0 ) ) {
            mca_btl_active_message_callback_t* reg;
            mca_btl_elan_hdr_t* elan_hdr = NULL;
            /* temporary descriptor living on the stack, only valid for the
             * duration of the callback below */
            mca_btl_elan_frag_t frag;

            elan_hdr = (mca_btl_elan_hdr_t*)elan_queueRxWait( elan_btl->rx_queue, NULL, 0 );

            /* the payload starts right behind the header; length and tag
             * are taken from the header */
            frag.base.des_dst = &frag.segment;
            frag.base.des_dst->seg_addr.pval = (void*)(elan_hdr+1);
            frag.base.des_dst->seg_len = (size_t)elan_hdr->length;
            frag.base.des_dst_cnt = 1;
            frag.tag = (mca_btl_base_tag_t)elan_hdr->tag;
            frag.size = elan_hdr->length;

            /* dispatch to the callback stored for this tag */
            reg = mca_btl_base_active_message_trigger + frag.tag;
            reg->cbfunc( &(elan_btl->super), frag.tag, &(frag.base), reg->cbdata );
            /* called only after the callback returned, i.e. once the
             * message has been consumed */
            elan_queueRxComplete( elan_btl->rx_queue );
            num_progressed++;
        }

        /* This is the slower receive over the tport */
        if(elan_btl->expect_tport_recv && !OPAL_THREAD_TRYLOCK(&elan_btl->elan_lock)) {
            /* NOTE(review): the list is not tested for emptiness before its
             * first item is dereferenced -- presumably at least one receive
             * is always posted while expect_tport_recv is set; confirm
             * against the code filling recv_list. */
            mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->recv_list) );
            if( elan_done(frag->elan_event, 0) ) {
                int tag;
                size_t length;
                mca_btl_active_message_callback_t* reg;
                void* recv_buf;

                recv_buf = (mca_btl_elan_hdr_t*)elan_tportRxWait( frag->elan_event, NULL, &tag, &length );
                num_progressed++;
                /*elan_btl->expect_tport_recv--;*/

                /* take the fragment off the list, then drop the lock so the
                 * callback runs without it */
                opal_list_remove_first( &(elan_btl->recv_list) );
                OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);

                frag->base.des_dst->seg_addr.pval = (void*)recv_buf;
                frag->base.des_dst->seg_len = length;
                frag->tag = (mca_btl_base_tag_t)tag;
                reg = mca_btl_base_active_message_trigger + frag->tag;
                reg->cbfunc( &(elan_btl->super), frag->tag, &(frag->base), reg->cbdata );
                /* the data was delivered in a buffer other than the storage
                 * following the fragment: give that buffer back and point
                 * the segment at the fragment's own storage again */
                if( recv_buf != (void*)(frag+1) ) {
                    elan_tportBufFree( elan_btl->tport, recv_buf );
                    frag->base.des_dst->seg_addr.pval = (void*)(frag+1);
                }
                /* post the same fragment again for the next message, with
                 * room for btl_eager_limit bytes */
                frag->elan_event = elan_tportRxStart( elan_btl->tport,
                                                      ELAN_TPORT_RXBUF | ELAN_TPORT_RXANY,
                                                      0, 0, 0, 0,
                                                      frag->base.des_dst->seg_addr.pval,
                                                      mca_btl_elan_module.super.btl_eager_limit );
                /* take the lock again: the append needs it and the unlock
                 * below is shared with the path where nothing completed */
                OPAL_THREAD_LOCK(&elan_btl->elan_lock);
                opal_list_append( &(elan_btl->recv_list), (opal_list_item_t*)frag );
            }
            OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
        }

        /* If there are any pending sends check their completion */
    recheck_send_list:
        /* the emptiness test happens before the lock is tried */
        if( !opal_list_is_empty( &(elan_btl->send_list) ) && !OPAL_THREAD_TRYLOCK(&elan_btl->elan_lock) ) {
            mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->send_list) );
            if( (NULL != frag) && elan_poll(frag->elan_event, 0) ) {
                /* read the ownership flag before the callback is invoked;
                 * the fragment is not looked at again except to return it */
                int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
                opal_list_remove_first( &(elan_btl->send_list) );
                OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
                num_progressed++;

                frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint,
                                       &(frag->base), OMPI_SUCCESS );
                if( btl_ownership ) {
                    MCA_BTL_ELAN_FRAG_RETURN(frag);
                }
                /* only the head is examined per pass: loop until the head
                 * is not complete, the list is empty or the lock is busy */
                goto recheck_send_list;
            } else {
                OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
            }
        }

    recheck_rdma_list:
        /* If any RDMA have been posted, check their status */
        /* same scheme as for the send list above, applied to rdma_list */
        if( !opal_list_is_empty( &(elan_btl->rdma_list) ) && !OPAL_THREAD_TRYLOCK(&elan_btl->elan_lock) ) {
            mca_btl_elan_frag_t* frag = (mca_btl_elan_frag_t*)opal_list_get_first( &(elan_btl->rdma_list) );
            if( (NULL != frag) && elan_poll(frag->elan_event, 0) ) {
                int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
                opal_list_remove_first( &(elan_btl->rdma_list) );
                OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
                num_progressed++;

                frag->base.des_cbfunc( &(elan_btl->super), frag->endpoint,
                                       &(frag->base), OMPI_SUCCESS );
                if( btl_ownership ) {
                    MCA_BTL_ELAN_FRAG_RETURN(frag);
                }
                goto recheck_rdma_list;
            } else {
                OPAL_THREAD_UNLOCK(&elan_btl->elan_lock);
            }
        }
    }

    return num_progressed;
}