openmpi/ompi/mca/ptl/gm/ptl_gm_component.c
Jeff Squyres ba99409628 Major simplifications to component versioning:
- After long discussions and ruminations on how we run components in
  LAM/MPI, we made the decision that, by default, all components included
  in Open MPI will use the version number of their parent project
  (i.e., OMPI or ORTE).  They are certainly free to use a different
  number, but this simplification makes the common cases easy:
  - components are only released when the parent project is released
  - it is easy (trivial?) to distinguish which version of a component
    goes with which version of the parent project
- removed all autogen/configure code for templating the version .h
  file in components
- made all ORTE components use ORTE_*_VERSION for version numbers
- made all OMPI components use OMPI_*_VERSION for version numbers
- removed all VERSION files from components
- configure now displays OPAL, ORTE, and OMPI version numbers
- ditto for ompi_info
- right now, faking it -- OPAL and ORTE and OMPI will always have the
  same version number (i.e., they all come from the same top-level
  VERSION file).  But this paves the way for the Great Configure
  Reorganization, where, among other things, each project will have
  its own version number.

So all in all, we went from a boatload of version numbers to
[effectively] three.  That's pretty good.  :-)

This commit was SVN r6344.
2005-07-04 20:12:36 +00:00

600 lines
21 KiB
C

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004      The Ohio State University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "include/constants.h"
#include "opal/util/output.h"
#include "mca/ptl/ptl.h"
#include "mca/pml/base/pml_base_module_exchange.h"
#include "ptl_gm.h"
#include "ptl_gm_priv.h"
#include "ptl_gm_peer.h"
#include "ptl_gm_sendfrag.h"

mca_ptl_gm_component_t mca_ptl_gm_component = {
    {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */
        {
            /* Indicate that we are a ptl v1.0.0 component (which also implies
               a specific MCA version) */
            MCA_PTL_BASE_VERSION_1_0_0,
            "gm",                       /* MCA component name */
            OMPI_MAJOR_VERSION,         /* MCA component major version */
            OMPI_MINOR_VERSION,         /* MCA component minor version */
            OMPI_RELEASE_VERSION,       /* MCA component release version */
            mca_ptl_gm_component_open,  /* component open */
            mca_ptl_gm_component_close  /* component close */
        },

        /* Next the MCA v1.0.0 component meta data */
        {
            /* Whether the component is checkpointable or not */
            false
        },

        mca_ptl_gm_component_init,
        mca_ptl_gm_component_control,
        mca_ptl_gm_component_progress
    }
};

/*
 * Utility routines for parameter registration.
 */
static inline char*
mca_ptl_gm_param_register_string( const char *param_name,
                                  const char *default_value )
{
    char *param_value;
    int id = mca_base_param_register_string( "ptl", "gm", param_name, NULL,
                                             default_value );
    mca_base_param_lookup_string( id, &param_value );
    return param_value;
}

static inline int
mca_ptl_gm_param_register_int( const char *param_name, int default_value )
{
    int id = mca_base_param_register_int( "ptl", "gm", param_name, NULL,
                                          default_value );
    int param_value = default_value;
    mca_base_param_lookup_int( id, &param_value );
    return param_value;
}

/*
 * Called by the MCA framework to open the component; registers
 * the component parameters.
 */
int
mca_ptl_gm_component_open(void)
{
    uint32_t default_first_frag_size;

    /* initialize state */
    mca_ptl_gm_component.gm_ptl_modules = NULL;
    mca_ptl_gm_component.gm_num_ptl_modules = 0;

    /* initialize objects */
    OBJ_CONSTRUCT( &mca_ptl_gm_component.gm_lock, opal_mutex_t );
    OBJ_CONSTRUCT( &mca_ptl_gm_component.gm_procs, opal_list_t );
    OBJ_CONSTRUCT( &mca_ptl_gm_component.gm_send_req, opal_list_t );

    /* register GM component parameters */
    mca_ptl_gm_component.gm_port_name =
        mca_ptl_gm_param_register_string( "port_name", "OMPI_GM" );
    mca_ptl_gm_component.gm_max_port_number =
        mca_ptl_gm_param_register_int( "max_ports_number", 16 );
    mca_ptl_gm_component.gm_max_boards_number =
        mca_ptl_gm_param_register_int( "max_boards_number", 4 );
    mca_ptl_gm_component.gm_max_ptl_modules =
        mca_ptl_gm_param_register_int( "max_ptl_modules", 1 );
    mca_ptl_gm_component.gm_segment_size =
        mca_ptl_gm_param_register_int( "segment_size", 32 * 1024 );

    default_first_frag_size =
        mca_ptl_gm_component.gm_segment_size - sizeof(mca_ptl_base_rendezvous_header_t);
    mca_ptl_gm_module.super.ptl_first_frag_size =
        mca_ptl_gm_param_register_int( "first_frag_size", default_first_frag_size );
    /* The first_frag_size must always be smaller than the gm_segment_size
     * by at least the size of the rendezvous header, so clamp it.
     */
    if( mca_ptl_gm_module.super.ptl_first_frag_size > default_first_frag_size ) {
        mca_ptl_gm_module.super.ptl_first_frag_size = default_first_frag_size;
    }
    mca_ptl_gm_module.super.ptl_min_frag_size =
        mca_ptl_gm_param_register_int( "min_frag_size", 64 * 1024 );
    mca_ptl_gm_module.super.ptl_max_frag_size =
        mca_ptl_gm_param_register_int( "max_frag_size", 256 * 1024 * 1024 );

    /* Parameters setting the message limits. */
    mca_ptl_gm_component.gm_eager_limit =
        mca_ptl_gm_param_register_int( "eager_limit", 128 * 1024 );
#if OMPI_MCA_PTL_GM_HAVE_RDMA_GET
    mca_ptl_gm_component.gm_rndv_burst_limit =
        mca_ptl_gm_param_register_int( "rndv_burst_limit", 512 * 1024 );
    mca_ptl_gm_component.gm_rdma_frag_size =
        mca_ptl_gm_param_register_int( "rdma_frag_size", 128 * 1024 );
#endif  /* OMPI_MCA_PTL_GM_HAVE_RDMA_GET */
    mca_ptl_gm_component.gm_free_list_num =
        mca_ptl_gm_param_register_int( "free_list_num", 256 );
    mca_ptl_gm_component.gm_free_list_inc =
        mca_ptl_gm_param_register_int( "free_list_inc", 32 );

    return OMPI_SUCCESS;
}
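
/* For reference: each parameter registered above becomes an MCA parameter
 * named "ptl_gm_<param_name>", overridable at run time through the usual
 * MCA mechanisms, e.g. (illustrative values):
 *
 *   mpirun --mca ptl_gm_segment_size 65536 --mca ptl_gm_eager_limit 262144 ...
 */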

/*
 * component close
 */
int mca_ptl_gm_component_close(void)
{
    uint32_t index;
    mca_ptl_base_module_t* ptl;

    for( index = 0; index < mca_ptl_gm_component.gm_num_ptl_modules; index++ ) {
        ptl = (mca_ptl_base_module_t*)mca_ptl_gm_component.gm_ptl_modules[index];
        if( NULL != ptl )
            ptl->ptl_finalize( ptl );
    }
    mca_ptl_gm_component.gm_num_ptl_modules = 0;
    if( NULL != mca_ptl_gm_component.gm_ptl_modules )
        free( mca_ptl_gm_component.gm_ptl_modules );

    OBJ_DESTRUCT( &mca_ptl_gm_component.gm_procs );
    OBJ_DESTRUCT( &mca_ptl_gm_component.gm_send_req );
    OBJ_DESTRUCT( &mca_ptl_gm_component.gm_lock );
    return OMPI_SUCCESS;
}

/*
 * Create a PTL instance and add it to the component's list.
 */
static int
mca_ptl_gm_create( mca_ptl_gm_module_t** pptl )
{
    mca_ptl_gm_module_t *ptl;

    ptl = (mca_ptl_gm_module_t *)malloc( sizeof(mca_ptl_gm_module_t) );
    if( NULL == ptl ) {
        opal_output( 0, " ran out of resource to allocate ptl_instance \n" );
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* copy the basic information into the new PTL */
    memcpy( ptl, &mca_ptl_gm_module, sizeof(mca_ptl_gm_module_t) );
#if OMPI_HAVE_POSIX_THREADS
    ptl->thread.t_handle = (pthread_t)-1;
#endif  /* OMPI_HAVE_POSIX_THREADS */
    *pptl = ptl;
    return OMPI_SUCCESS;
}

/*
 * Register GM component addressing information. The MCA framework
 * will make this available to all peers.
 */
static int
mca_ptl_gm_module_store_data_toexchange(void)
{
    int rc;
    size_t i;
    size_t size;
    mca_ptl_gm_addr_t *addrs;

    size = mca_ptl_gm_component.gm_num_ptl_modules * sizeof(mca_ptl_gm_addr_t);
    addrs = (mca_ptl_gm_addr_t *)malloc( size );
    if( NULL == addrs ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for( i = 0; i < mca_ptl_gm_component.gm_num_ptl_modules; i++ ) {
        mca_ptl_gm_module_t *ptl = mca_ptl_gm_component.gm_ptl_modules[i];
        addrs[i].local_id = ptl->local_addr.local_id;
#if GM_API_VERSION > 0x200
        addrs[i].global_id = ptl->local_addr.global_id;
#else
        strncpy( addrs[i].global_id, ptl->local_addr.global_id, GM_MAX_HOST_NAME_LEN );
#endif  /* GM_API_VERSION > 0x200 */
        addrs[i].port_id = ptl->local_addr.port_id;
    }
    rc = mca_base_modex_send( &mca_ptl_gm_component.super.ptlm_version, addrs, size );
    free( addrs );
    return rc;
}
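
/* The addresses published above with mca_base_modex_send() are retrieved
 * on the peer side (typically during add_procs) with the matching
 * mca_base_modex_recv() call, keyed by the same ptlm_version structure.
 */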

#if OMPI_HAVE_POSIX_THREADS
static void*
mca_ptl_gm_thread_progress( opal_thread_t* thread )
{
    gm_recv_event_t *event;
    mca_ptl_gm_module_t* ptl = thread->t_arg;

    /* This thread runs with cancellation enabled so it can be cancelled
     * asynchronously while blocked in gm_blocking_receive(). */
    pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
    pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL );

    while(1) {
        event = gm_blocking_receive( ptl->gm_port );
        if( GM_NO_RECV_EVENT != gm_ntohc(event->recv.type) )
            mca_ptl_gm_analyze_recv_event( ptl, event );
    }
    return PTHREAD_CANCELED;
}
#endif  /* OMPI_HAVE_POSIX_THREADS */
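
/* When progress threads are not used, the same receive-event analysis is
 * instead driven by polling from mca_ptl_gm_component_progress() at the
 * bottom of this file.
 */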

/* Scan all ports on all boards. As it is difficult to find the total
 * number of boards we use a predefined maximum.
 * Return the number of boards where opening a port was a successful
 * operation.
 */
static int32_t
mca_ptl_gm_discover_boards( mca_ptl_gm_module_t** pptl,
                            uint32_t max_ptls, uint32_t max_boards, uint32_t max_port )
{
    uint32_t board_no, port_no, index = 0, local_id;
    struct gm_port* gm_port;
#if GM_API_VERSION > 0x200
    uint32_t global_id;
#else
    char global_id[GM_MAX_HOST_NAME_LEN];
#endif  /* GM_API_VERSION > 0x200 */

    for( board_no = 0; board_no < max_boards; board_no++ ) {
        /* open the first available GM port for this board */
        for( port_no = 2; port_no < max_port; port_no++ ) {
            if( 3 == port_no ) {
                continue;  /* ports 0, 1 and 3 are reserved */
            } else if( GM_SUCCESS ==
                       gm_open( &gm_port, board_no, port_no,
                                mca_ptl_gm_component.gm_port_name,
                                OMPI_MCA_PTL_GM_API_VERSION ) ) {
                break;
            }
        }
        if( port_no == max_port ) {
            continue;  /* no usable port on this board */
        }

        /* Get the node local id */
        if( GM_SUCCESS != gm_get_node_id( gm_port, &local_id ) ) {
            opal_output( 0, " failure to get local_id \n" );
            continue;
        }

        /* Gather a unique id for the node */
#if GM_API_VERSION > 0x200
        if( GM_SUCCESS != gm_node_id_to_global_id( gm_port, local_id, &global_id ) ) {
            opal_output( 0, " Error: Unable to get my GM global unique id \n" );
            continue;
        }
#else
        if( GM_SUCCESS != gm_get_host_name( gm_port, global_id ) ) {
            opal_output( 0, "Error: Unable to get the GM host name\n" );
            continue;
        }
#endif  /* GM_API_VERSION > 0x200 */

        /* Create the PTL. If it fails, return the number already created. */
        if( OMPI_SUCCESS != mca_ptl_gm_create( &(pptl[index]) ) ) {
            return index;
        }
        pptl[index]->gm_port = gm_port;
        pptl[index]->local_addr.port_id = port_no;
        pptl[index]->local_addr.local_id = local_id;
#if GM_API_VERSION > 0x200
        pptl[index]->local_addr.global_id = global_id;
#else
        strncpy( pptl[index]->local_addr.global_id, global_id, GM_MAX_HOST_NAME_LEN );
#endif  /* GM_API_VERSION > 0x200 */

        /* everything is OK, mark it as usable and go to the next one */
        if( (++index) >= max_ptls ) {
            break;
        }
    }
    return index;
}
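
/* mca_ptl_gm_discover_boards() is called from mca_ptl_gm_init() below with
 * the maximums registered as MCA parameters in mca_ptl_gm_component_open().
 */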

static int
mca_ptl_gm_init_sendrecv( mca_ptl_gm_module_t * ptl )
{
    uint32_t i;
    mca_ptl_gm_send_frag_t *sfragment;
    mca_ptl_gm_recv_frag_t *free_rfragment;

    ptl->num_send_tokens = gm_num_send_tokens( ptl->gm_port );
    ptl->max_send_tokens = ptl->num_send_tokens;
    ptl->num_send_tokens -= PTL_GM_ADMIN_SEND_TOKENS;
    ptl->num_recv_tokens = gm_num_receive_tokens( ptl->gm_port );
    ptl->max_recv_tokens = ptl->num_recv_tokens;
    ptl->num_recv_tokens -= PTL_GM_ADMIN_RECV_TOKENS;
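    /* The PTL_GM_ADMIN_*_TOKENS counts are held back from the totals
     * reported by GM so that internal (administrative) traffic always has
     * tokens available; only the remainder backs user-level fragments.
     */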

    /**************** SEND ****************/
    /* construct the lists of send fragments */
    OBJ_CONSTRUCT( &(ptl->gm_send_frags), ompi_free_list_t );
    OBJ_CONSTRUCT( &(ptl->gm_send_dma_frags), ompi_free_list_t );
    OBJ_CONSTRUCT( &(ptl->gm_send_frags_queue), opal_list_t );

    /* We need a free list just to handle the send fragments that we provide.
     * To make sure that we don't waste memory, we don't allow this list to
     * grow.
     */
    ompi_free_list_init( &(ptl->gm_send_frags),
                         sizeof(mca_ptl_gm_send_frag_t),
                         OBJ_CLASS(mca_ptl_gm_send_frag_t),
                         0,      /* do not allocate any items, they are provided below */
                         0,      /* maximum number of list-allocated elements is zero */
                         0,
                         NULL ); /* not using mpool */
    /* A free list containing all DMA allocated memory.
     * This free list is not allowed to allocate any new items
     * as they have to be allocated with a special GM function.
     */
    ompi_free_list_init( &(ptl->gm_send_dma_frags),
                         mca_ptl_gm_component.gm_segment_size,
                         OBJ_CLASS(opal_list_item_t),
                         0,      /* do not allocate any items, they are provided below */
                         0,      /* maximum number of list-allocated elements is zero */
                         0,
                         NULL ); /* not using mpool */

    /* allocate the elements */
    sfragment = (mca_ptl_gm_send_frag_t *)calloc( ptl->num_send_tokens,
                                                  sizeof(mca_ptl_gm_send_frag_t) );
    if( NULL == sfragment ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    ptl->gm_send_fragments = sfragment;
    /* allocate the registered memory */
    ptl->gm_send_dma_memory =
        gm_dma_malloc( ptl->gm_port,
                       (mca_ptl_gm_component.gm_segment_size * ptl->num_send_tokens) + GM_PAGE_LEN );
    if( NULL == ptl->gm_send_dma_memory ) {
        opal_output( 0, "unable to allocate registered memory\n" );
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    for( i = 0; i < ptl->num_send_tokens; i++ ) {
        sfragment->send_buf = NULL;
        OMPI_GM_FREE_LIST_RETURN( &(ptl->gm_send_frags), (opal_list_item_t*)sfragment );
        OMPI_GM_FREE_LIST_RETURN( &(ptl->gm_send_dma_frags),
                                  (opal_list_item_t*)((char*)ptl->gm_send_dma_memory +
                                                      i * mca_ptl_gm_component.gm_segment_size) );
        sfragment++;
    }

    /**************** RECEIVE ****************/
    /* allow remote memory access */
    if( GM_SUCCESS != gm_allow_remote_memory_access( ptl->gm_port ) ) {
        opal_output( 0, "unable to allow remote memory access\n" );
    }
    OBJ_CONSTRUCT( &(ptl->gm_recv_outstanding_queue), opal_list_t );

    /* construct the list of free recv fragments */
    OBJ_CONSTRUCT( &(ptl->gm_recv_frags_free), ompi_free_list_t );
    ompi_free_list_init( &(ptl->gm_recv_frags_free),
                         sizeof(mca_ptl_gm_recv_frag_t),
                         OBJ_CLASS(mca_ptl_gm_recv_frag_t),
                         0,                          /* all initial items are provided below */
                         ptl->num_recv_tokens * 10,  /* maximum number of items in the free list */
                         ptl->num_recv_tokens,       /* growth increment if more are needed */
                         NULL );

    /* allocate the elements */
    free_rfragment = (mca_ptl_gm_recv_frag_t *)
        calloc( ptl->num_recv_tokens, sizeof(mca_ptl_gm_recv_frag_t) );
    ptl->gm_recv_fragments = free_rfragment;

    /* allocate the registered memory */
    ptl->gm_recv_dma_memory =
        gm_dma_malloc( ptl->gm_port,
                       (mca_ptl_gm_component.gm_segment_size * ptl->num_recv_tokens) + GM_PAGE_LEN );
    if( NULL == ptl->gm_recv_dma_memory ) {
        opal_output( 0, "unable to allocate registered memory for receive\n" );
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
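    /* The first two receive buffers are posted at GM_HIGH_PRIORITY (GM
     * tracks receive buffers per priority level, so high-priority sends,
     * e.g. control messages, land in these); the remaining buffers are
     * posted at GM_LOW_PRIORITY for regular data fragments.
     */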
    for( i = 0; i < 2; i++ ) {
        OMPI_GM_FREE_LIST_RETURN( &(ptl->gm_recv_frags_free), (opal_list_item_t *)free_rfragment );
        free_rfragment++;
        gm_provide_receive_buffer( ptl->gm_port,
                                   (char*)ptl->gm_recv_dma_memory + i * mca_ptl_gm_component.gm_segment_size,
                                   GM_SIZE, GM_HIGH_PRIORITY );
    }
    for( i = 2; i < ptl->num_recv_tokens; i++ ) {
        OMPI_GM_FREE_LIST_RETURN( &(ptl->gm_recv_frags_free), (opal_list_item_t *)free_rfragment );
        free_rfragment++;
        gm_provide_receive_buffer( ptl->gm_port,
                                   (char*)ptl->gm_recv_dma_memory + i * mca_ptl_gm_component.gm_segment_size,
                                   GM_SIZE, GM_LOW_PRIORITY );
    }
    OBJ_CONSTRUCT( &(ptl->gm_pending_acks), opal_list_t );
    return OMPI_SUCCESS;
}

static int
mca_ptl_gm_init( mca_ptl_gm_component_t * gm )
{
    uint32_t index;
    mca_ptl_gm_module_t* ptl;
    uint32_t save_counter;

    /* let's find out if GM is available */
    if( GM_SUCCESS != gm_init() ) {
        opal_output( 0, "[%s:%d] error in initializing the gm library\n", __FILE__, __LINE__ );
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* First discover all available boards. For each board we will create a unique PTL */
    mca_ptl_gm_component.gm_ptl_modules = calloc( mca_ptl_gm_component.gm_max_ptl_modules,
                                                  sizeof(mca_ptl_gm_module_t *) );
    if( NULL == mca_ptl_gm_component.gm_ptl_modules ) {
        opal_output( 0, "[%s:%d] error in initializing the gm PTL's.\n", __FILE__, __LINE__ );
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    mca_ptl_gm_component.gm_num_ptl_modules =
        mca_ptl_gm_discover_boards( mca_ptl_gm_component.gm_ptl_modules,
                                    mca_ptl_gm_component.gm_max_ptl_modules,
                                    mca_ptl_gm_component.gm_max_boards_number,
                                    mca_ptl_gm_component.gm_max_port_number );

    /* In a multi-threaded environment each PTL will have its own progress
     * thread. At this point all structures are correctly initialized;
     * each thread will grab one and use it.
     */
    for( index = 0; index < mca_ptl_gm_component.gm_num_ptl_modules; index++ ) {
        ptl = mca_ptl_gm_component.gm_ptl_modules[index];
        /* Now prepost some receives and allocate some sends. After
         * this step the PTL is fully initialized.
         */
        if( OMPI_SUCCESS != mca_ptl_gm_init_sendrecv( ptl ) )
            break;
        if( opal_using_threads() ) {
#if OMPI_HAVE_POSIX_THREADS
            ptl->thread.t_run = (opal_thread_fn_t)mca_ptl_gm_thread_progress;
            ptl->thread.t_arg = (void*)ptl;
#endif  /* OMPI_HAVE_POSIX_THREADS */
            if( OMPI_SUCCESS != opal_thread_start( &(ptl->thread) ) ) {
                break;
            }
        }
    }
    save_counter = index;
    /* If we were unable to initialize all the required PTLs, update the
     * total number of modules and finalize the remaining PTLs.
     */
    for( ; index < mca_ptl_gm_component.gm_num_ptl_modules; index++ ) {
        mca_ptl_base_module_t* ptl =
            (mca_ptl_base_module_t*)mca_ptl_gm_component.gm_ptl_modules[index];
        ptl->ptl_finalize( ptl );
    }
    mca_ptl_gm_component.gm_num_ptl_modules = save_counter;

    /* A free list containing the memory used to keep data from unexpected requests. */
    OBJ_CONSTRUCT( &(mca_ptl_gm_component.gm_unexpected_frags_data), ompi_free_list_t );
    ompi_free_list_init( &(mca_ptl_gm_component.gm_unexpected_frags_data),
                         mca_ptl_gm_component.gm_segment_size,
                         OBJ_CLASS(opal_list_item_t),
                         16,     /* keep it small in the beginning */
                         128,    /* maximum number of list elements */
                         16,     /* number of elements to grow by per allocation */
                         NULL ); /* not using mpool */
#if OMPI_MCA_PTL_GM_CACHE_ENABLE
    mca_ptl_gm_regcache_init();
#endif  /* OMPI_MCA_PTL_GM_CACHE_ENABLE */

    return (mca_ptl_gm_component.gm_num_ptl_modules > 0 ? OMPI_SUCCESS : OMPI_ERR_OUT_OF_RESOURCE);
}

/*
 * Initialize the GM component: check how many boards are available
 * and open a port on each of them.
 */
mca_ptl_base_module_t **
mca_ptl_gm_component_init( int *num_ptl_modules,
                           bool enable_progress_threads,
                           bool enable_mpi_threads )
{
    mca_ptl_base_module_t **ptls;

    *num_ptl_modules = 0;
    if( OMPI_SUCCESS != mca_ptl_gm_init(&mca_ptl_gm_component) ) {
        /* opal_output( 0, "[%s:%d] error in initializing gm state and PTL's. (%d PTL's)\n",
                        __FILE__, __LINE__, mca_ptl_gm_component.gm_num_ptl_modules ); */
        return NULL;
    }

    /* publish GM parameters with the MCA framework */
    if( OMPI_SUCCESS != mca_ptl_gm_module_store_data_toexchange() )
        return NULL;

    /* return an array of PTLs */
    ptls = (mca_ptl_base_module_t**)malloc( mca_ptl_gm_component.gm_num_ptl_modules *
                                            sizeof(mca_ptl_base_module_t *) );
    if( NULL == ptls ) {
        return NULL;
    }
    memcpy( ptls, mca_ptl_gm_component.gm_ptl_modules,
            mca_ptl_gm_component.gm_num_ptl_modules * sizeof(mca_ptl_gm_module_t *) );
    *num_ptl_modules = mca_ptl_gm_component.gm_num_ptl_modules;
    return ptls;
}

/*
 * GM component control.
 */
int
mca_ptl_gm_component_control( int param, void *value, size_t size )
{
    return OMPI_SUCCESS;
}

char* gm_get_local_buffer( void )
{
    opal_list_item_t* item;
    int rc;

    OMPI_FREE_LIST_WAIT( &(mca_ptl_gm_component.gm_unexpected_frags_data), item, rc );
    return (char*)item;
}

void gm_release_local_buffer( char* ptr )
{
    OMPI_GM_FREE_LIST_RETURN( &(mca_ptl_gm_component.gm_unexpected_frags_data),
                              (opal_list_item_t*)ptr );
}
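
/* These two helpers hand out gm_segment_size-sized buffers from the
 * unexpected-fragments free list: OMPI_FREE_LIST_WAIT blocks until an
 * item is available, and the release simply returns the buffer to the list.
 */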

/*
 * GM component progress.
 */
int
mca_ptl_gm_component_progress( mca_ptl_tstamp_t tstamp )
{
    uint32_t i;
    gm_recv_event_t *event;
    mca_ptl_gm_module_t *ptl;

    for( i = 0; i < mca_ptl_gm_component.gm_num_ptl_modules; ) {
        ptl = mca_ptl_gm_component.gm_ptl_modules[i];
        event = gm_receive( ptl->gm_port );
        /* If there are no receive events just skip the function call */
        if( GM_NO_RECV_EVENT != gm_ntohc(event->recv.type) ) {
            if( 1 == mca_ptl_gm_analyze_recv_event( ptl, event ) ) {
                /* try to drain the GM event queue before moving to the next PTL */
                continue;
            }
        }
        i++;
    }
    return OMPI_SUCCESS;
}