dbe2798638
are disabled (btl_mx_shared_mem respectively btl_mx_self have to be set in order to activate them). This commit was SVN r12922.
467 строки
19 KiB
C
467 строки
19 KiB
C
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
|
|
#include "ompi_config.h"
|
|
#include "ompi/constants.h"
|
|
#include "opal/event/event.h"
|
|
#include "opal/util/opal_environ.h"
|
|
#include "opal/util/if.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/output.h"
|
|
#include "ompi/mca/pml/pml.h"
|
|
#include "ompi/mca/btl/btl.h"
|
|
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "ompi/mca/mpool/base/base.h"
|
|
#include "btl_mx.h"
|
|
#include "btl_mx_frag.h"
|
|
#include "btl_mx_endpoint.h"
|
|
#include "ompi/mca/btl/base/base.h"
|
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
|
|
|
extern char** environ;
|
|
|
|
mca_btl_mx_component_t mca_btl_mx_component = {
|
|
{
|
|
/* First, the mca_base_component_t struct containing meta information
|
|
about the component itself */
|
|
|
|
{
|
|
/* Indicate that we are a pml v1.0.0 component (which also implies a
|
|
specific MCA version) */
|
|
|
|
MCA_BTL_BASE_VERSION_1_0_1,
|
|
|
|
"mx", /* MCA component name */
|
|
OMPI_MAJOR_VERSION, /* MCA component major version */
|
|
OMPI_MINOR_VERSION, /* MCA component minor version */
|
|
OMPI_RELEASE_VERSION, /* MCA component release version */
|
|
mca_btl_mx_component_open, /* component open */
|
|
mca_btl_mx_component_close /* component close */
|
|
},
|
|
|
|
/* Next the MCA v1.0.0 component meta data */
|
|
|
|
{
|
|
/* Whether the component is checkpointable or not */
|
|
|
|
false
|
|
},
|
|
|
|
mca_btl_mx_component_init,
|
|
mca_btl_mx_component_progress,
|
|
}
|
|
};
|
|
|
|
/*
|
|
* Called by MCA framework to open the component, registers
|
|
* component parameters.
|
|
*/
|
|
|
|
int mca_btl_mx_component_open(void)
|
|
{
|
|
int tmp;
|
|
|
|
/* initialize state */
|
|
mca_btl_mx_component.mx_num_btls = 0;
|
|
mca_btl_mx_component.mx_btls = NULL;
|
|
|
|
/* initialize objects */
|
|
OBJ_CONSTRUCT(&mca_btl_mx_component.mx_procs, opal_list_t);
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "max_btls",
|
|
"Maximum number of accepted Myrinet cards",
|
|
false, false, 1, &mca_btl_mx_component.mx_max_btls );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "timeout",
|
|
"Timeout for connections",
|
|
false, false, MX_INFINITE, &mca_btl_mx_component.mx_timeout );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "retries",
|
|
"Number of retries for each new connection before considering the peer as unreacheable",
|
|
false, false, 20, &mca_btl_mx_component.mx_connection_retries );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "filter",
|
|
"Unique ID for the application (used to connect to the peers)",
|
|
false, false, 0xdeadbeef, &mca_btl_mx_component.mx_filter );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "self",
|
|
"Enable the MX support for self communications",
|
|
false, false, 0, &mca_btl_mx_component.mx_support_self );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "shared_mem",
|
|
"Enable the MX support for shared memory",
|
|
false, false, 0, &mca_btl_mx_component.mx_support_sharedmem );
|
|
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "free_list_num",
|
|
"Number of allocated default request",
|
|
false, false, 8, &mca_btl_mx_component.mx_free_list_num );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "free_list_inc",
|
|
"Number of request we allocate each time we miss some",
|
|
false, false, 32, &mca_btl_mx_component.mx_free_list_inc );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "free_list_max",
|
|
"Maximum number of request this device is allowed to allocate",
|
|
false, false, 128, &mca_btl_mx_component.mx_free_list_max );
|
|
/* The ompi_free_list has a problem if the (max - num) is not
|
|
* divisible by the increament. So make sure it is ...
|
|
*/
|
|
if( (mca_btl_mx_component.mx_free_list_max - mca_btl_mx_component.mx_free_list_num) %
|
|
mca_btl_mx_component.mx_free_list_inc ) {
|
|
int overhead = (mca_btl_mx_component.mx_free_list_max - mca_btl_mx_component.mx_free_list_num) %
|
|
mca_btl_mx_component.mx_free_list_inc;
|
|
mca_btl_mx_component.mx_free_list_max -= overhead;
|
|
}
|
|
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "max_posted_recv",
|
|
"Number of received posted in advance. Increasing this number for communication bound application can lead to visible improvement in performances",
|
|
false, false, 16, &mca_btl_mx_component.mx_max_posted_recv );
|
|
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "exclusivity",
|
|
"Priority compared with the others devices (used only when several devices are available",
|
|
false, false, 50, (int*) &mca_btl_mx_module.super.btl_exclusivity );
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "first_frag_size",
|
|
"Size of the first fragment for the rendez-vous protocol over MX",
|
|
true, true, 16*1024 - 20, &tmp);
|
|
mca_btl_mx_module.super.btl_eager_limit = tmp;
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "min_send_size",
|
|
"Minimum send fragment size ...",
|
|
false, false, 32*1024 - 40, &tmp);
|
|
mca_btl_mx_module.super.btl_min_send_size = tmp;
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "max_send_size",
|
|
"Maximum send fragment size withour RDMA ...",
|
|
false, false, 128*1024, &tmp);
|
|
mca_btl_mx_module.super.btl_max_send_size = tmp;
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "min_rdma_size",
|
|
"Minimum size of fragment for the RDMA protocol",
|
|
false, false, 1024*1024, &tmp);
|
|
mca_btl_mx_module.super.btl_min_rdma_size = tmp;
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "max_rdma_size",
|
|
"Maximum size of fragment for the RDMA protocol",
|
|
false, false, 1024*1024, &tmp);
|
|
mca_btl_mx_module.super.btl_max_rdma_size = tmp;
|
|
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "flags",
|
|
"Flags to activate/deactivate the RDMA",
|
|
true, false, MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_PUT,
|
|
(int*)&mca_btl_mx_module.super.btl_flags );
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* component cleanup - sanity checking of queue lengths
|
|
*/
|
|
|
|
int mca_btl_mx_component_close(void)
|
|
{
|
|
if( NULL == mca_btl_mx_component.mx_btls )
|
|
return OMPI_SUCCESS;
|
|
|
|
mx_finalize();
|
|
|
|
/* release resources */
|
|
OBJ_DESTRUCT(&mca_btl_mx_component.mx_send_eager_frags);
|
|
OBJ_DESTRUCT(&mca_btl_mx_component.mx_send_user_frags);
|
|
OBJ_DESTRUCT(&mca_btl_mx_component.mx_recv_frags);
|
|
OBJ_DESTRUCT(&mca_btl_mx_component.mx_procs);
|
|
OBJ_DESTRUCT(&mca_btl_mx_component.mx_pending_acks);
|
|
OBJ_DESTRUCT(&mca_btl_mx_component.mx_lock);
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Create and intialize an MX PTL module, where each module
|
|
* represents a specific NIC.
|
|
*/
|
|
|
|
static mca_btl_mx_module_t* mca_btl_mx_create(uint64_t addr)
|
|
{
|
|
mca_btl_mx_module_t* mx_btl;
|
|
mx_return_t status;
|
|
uint32_t nic_id;
|
|
|
|
status = mx_nic_id_to_board_number( addr, &nic_id );
|
|
if( MX_SUCCESS != status ) {
|
|
return NULL;
|
|
}
|
|
|
|
mx_btl = malloc(sizeof(mca_btl_mx_module_t));
|
|
if( NULL == mx_btl ) return NULL;
|
|
|
|
/* copy over default settings */
|
|
memcpy( mx_btl, &mca_btl_mx_module, sizeof(mca_btl_mx_module_t) );
|
|
OBJ_CONSTRUCT( &mx_btl->mx_peers, opal_list_t );
|
|
OBJ_CONSTRUCT( &mx_btl->mx_lock, opal_mutex_t );
|
|
/* open local endpoint */
|
|
status = mx_open_endpoint( nic_id, MX_ANY_ENDPOINT,
|
|
mca_btl_mx_component.mx_filter,
|
|
NULL, 0, &mx_btl->mx_endpoint);
|
|
if(status != MX_SUCCESS) {
|
|
opal_output(0, "mca_btl_mx_init: mx_open_endpoint() failed with status=%d\n", status);
|
|
mca_btl_mx_finalize( &mx_btl->super );
|
|
return NULL;
|
|
}
|
|
|
|
/* query the endpoint address */
|
|
if((status = mx_get_endpoint_addr( mx_btl->mx_endpoint,
|
|
&mx_btl->mx_endpoint_addr)) != MX_SUCCESS) {
|
|
opal_output(0, "mca_btl_mx_init: mx_get_endpoint_addr() failed with status=%d\n", status);
|
|
mca_btl_mx_finalize( &mx_btl->super );
|
|
return NULL;
|
|
}
|
|
|
|
return mx_btl;
|
|
}
|
|
|
|
/*
|
|
* MX component initialization:
|
|
* - check if MX can be initialized.
|
|
* - and construct all static objects.
|
|
*/
|
|
|
|
mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
|
|
bool enable_progress_threads,
|
|
bool enable_mpi_threads)
|
|
{
|
|
mca_btl_base_module_t** btls;
|
|
mx_return_t status;
|
|
uint32_t size, count;
|
|
int32_t i;
|
|
uint64_t *nic_addrs;
|
|
mca_btl_mx_addr_t *mx_addrs;
|
|
|
|
*num_btl_modules = 0;
|
|
|
|
if (enable_progress_threads) {
|
|
opal_output( 0, "mca_btl_mx_component_init: progress threads requested but not supported");
|
|
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
|
|
NULL, 0);
|
|
return NULL;
|
|
}
|
|
|
|
/* set the MX error handle to always return. This function is the only MX function
|
|
* allowed to be called before mx_init in order to make sure that if the MX is not
|
|
* up and running the MX library does not exit the application.
|
|
*/
|
|
mx_set_error_handler(MX_ERRORS_RETURN);
|
|
if( 0 == mca_btl_mx_component.mx_support_sharedmem )
|
|
opal_setenv( "MX_DISABLE_SHMEM", "1", true, &environ );
|
|
if( 0 == mca_btl_mx_component.mx_support_self )
|
|
opal_setenv( "MX_DISABLE_SELF", "1", true, &environ );
|
|
|
|
/* First check if MX is available ... */
|
|
if( MX_SUCCESS != (status = mx_init()) ) {
|
|
opal_output( 0, "mca_btl_mx_component_init: mx_init() failed with status = %d (%s)\n",
|
|
status, mx_strerror(status) );
|
|
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
|
|
NULL, 0);
|
|
return NULL;
|
|
}
|
|
|
|
/* initialize objects */
|
|
OBJ_CONSTRUCT(&mca_btl_mx_component.mx_send_eager_frags, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_mx_component.mx_send_user_frags, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_mx_component.mx_recv_frags, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_mx_component.mx_procs, opal_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_mx_component.mx_pending_acks, opal_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_mx_component.mx_lock, opal_mutex_t);
|
|
|
|
ompi_free_list_init( &mca_btl_mx_component.mx_send_eager_frags,
|
|
sizeof(mca_btl_mx_frag_t) + mca_btl_mx_module.super.btl_eager_limit,
|
|
OBJ_CLASS(mca_btl_mx_frag_t),
|
|
mca_btl_mx_component.mx_free_list_num,
|
|
mca_btl_mx_component.mx_free_list_max,
|
|
mca_btl_mx_component.mx_free_list_inc,
|
|
NULL ); /* use default allocator */
|
|
|
|
ompi_free_list_init( &mca_btl_mx_component.mx_send_user_frags,
|
|
sizeof(mca_btl_mx_frag_t),
|
|
OBJ_CLASS(mca_btl_mx_frag_t),
|
|
mca_btl_mx_component.mx_free_list_num,
|
|
mca_btl_mx_component.mx_free_list_max,
|
|
mca_btl_mx_component.mx_free_list_inc,
|
|
NULL ); /* use default allocator */
|
|
|
|
ompi_free_list_init( &mca_btl_mx_component.mx_recv_frags,
|
|
sizeof(mca_btl_mx_frag_t),
|
|
OBJ_CLASS(mca_btl_mx_frag_t),
|
|
mca_btl_mx_component.mx_free_list_num,
|
|
mca_btl_mx_component.mx_free_list_max,
|
|
mca_btl_mx_component.mx_free_list_inc,
|
|
NULL ); /* use default allocator */
|
|
|
|
/* intialize process hash table */
|
|
OBJ_CONSTRUCT( &mca_btl_mx_component.mx_procs, opal_list_t );
|
|
|
|
/* get the number of card available on the system */
|
|
if( (status = mx_get_info( NULL, MX_NIC_COUNT, NULL, 0,
|
|
&mca_btl_mx_component.mx_num_btls, sizeof(uint32_t))) != MX_SUCCESS ) {
|
|
opal_output(0, "mca_btl_mx_component_init: mx_get_info(MX_NIC_COUNT) failed with status=%d\n", status);
|
|
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
|
|
NULL, 0);
|
|
return NULL;
|
|
}
|
|
|
|
if (0 == mca_btl_mx_component.mx_num_btls) {
|
|
mca_btl_base_error_no_nics("Myrinet/MX", "NIC");
|
|
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
|
|
NULL, 0);
|
|
return NULL;
|
|
}
|
|
|
|
/* check for limit on number of btls */
|
|
if(mca_btl_mx_component.mx_num_btls > mca_btl_mx_component.mx_max_btls)
|
|
mca_btl_mx_component.mx_num_btls = mca_btl_mx_component.mx_max_btls;
|
|
|
|
/* Now we know how many NIC are available on the system. We will create a BTL for each one
|
|
* and then give a pointer to the BTL to the upper level.
|
|
*/
|
|
mca_btl_mx_component.mx_btls = malloc( mca_btl_mx_component.mx_num_btls * sizeof(mca_btl_base_module_t*) );
|
|
if( NULL == mca_btl_mx_component.mx_btls )
|
|
return NULL;
|
|
|
|
/* determine the NIC ids */
|
|
size = sizeof(uint64_t) * (mca_btl_mx_component.mx_num_btls + 1);
|
|
if( NULL == (nic_addrs = (uint64_t*)malloc(size)) )
|
|
return NULL;
|
|
if( (status = mx_get_info( NULL, MX_NIC_IDS, NULL, 0,
|
|
nic_addrs, size)) != MX_SUCCESS) {
|
|
free(nic_addrs);
|
|
return NULL;
|
|
}
|
|
|
|
size = sizeof(mca_btl_mx_addr_t) * mca_btl_mx_component.mx_num_btls;
|
|
mx_addrs = (mca_btl_mx_addr_t*)malloc( size );
|
|
if( NULL == mx_addrs ) {
|
|
free( nic_addrs );
|
|
return NULL;
|
|
}
|
|
|
|
/* create a btl for each NIC */
|
|
for( i = count = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) {
|
|
mca_btl_mx_module_t* btl = mca_btl_mx_create(nic_addrs[i]);
|
|
if( NULL == btl ) {
|
|
continue;
|
|
}
|
|
status = mx_decompose_endpoint_addr( btl->mx_endpoint_addr, &(mx_addrs[i].nic_id),
|
|
&(mx_addrs[i].endpoint_id) );
|
|
if( MX_SUCCESS != status ) {
|
|
OBJ_RELEASE( btl );
|
|
continue;
|
|
}
|
|
mca_btl_mx_component.mx_btls[count++] = btl;
|
|
}
|
|
size = sizeof(mca_btl_mx_addr_t) * count;
|
|
if( 0 == count ) {
|
|
/* No active BTL module */
|
|
}
|
|
mca_btl_mx_component.mx_num_btls = count;
|
|
|
|
/* publish the MX addresses via the MCA framework */
|
|
mca_pml_base_modex_send( &mca_btl_mx_component.super.btl_version, mx_addrs, size );
|
|
|
|
free( nic_addrs );
|
|
free( mx_addrs );
|
|
|
|
btls = malloc( mca_btl_mx_component.mx_num_btls * sizeof(mca_btl_base_module_t*) );
|
|
|
|
if( NULL == btls ) {
|
|
free( mca_btl_mx_component.mx_btls );
|
|
mca_btl_mx_component.mx_num_btls = 0; /* no active BTL modules */
|
|
return NULL;
|
|
}
|
|
memcpy( btls, mca_btl_mx_component.mx_btls,
|
|
mca_btl_mx_component.mx_num_btls*sizeof(mca_btl_mx_module_t*) );
|
|
*num_btl_modules = mca_btl_mx_component.mx_num_btls;
|
|
return btls;
|
|
}
|
|
|
|
/*
|
|
* MX component progress.
|
|
*/
|
|
int mca_btl_mx_component_progress()
|
|
{
|
|
int32_t num_progressed = 0, i;
|
|
mx_status_t mx_status;
|
|
mx_return_t mx_return;
|
|
mx_segment_t mx_segment;
|
|
mx_request_t mx_request;
|
|
mca_btl_mx_frag_t* frag;
|
|
|
|
for( i = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) {
|
|
mca_btl_mx_module_t* mx_btl = mca_btl_mx_component.mx_btls[i];
|
|
uint32_t mx_result = 0;
|
|
|
|
/* pre-post receive */
|
|
#if 0
|
|
if( mx_btl->mx_recvs_posted == 0 ) {
|
|
OPAL_THREAD_ADD32( &mx_btl->mx_recvs_posted, 1 );
|
|
MCA_BTL_MX_POST( mx_btl, frag );
|
|
}
|
|
#endif
|
|
|
|
/*if( mx_btl->mx_posted_request ) { */
|
|
mx_return = mx_ipeek( mx_btl->mx_endpoint, &mx_request, &mx_result );
|
|
if( mx_return != MX_SUCCESS ) {
|
|
opal_output(0, "mca_btl_mx_component_progress: mx_ipeek() failed with status %d\n",
|
|
mx_return);
|
|
continue;
|
|
}
|
|
if( mx_result == 0 ) {
|
|
continue;
|
|
}
|
|
|
|
mx_return = mx_test( mx_btl->mx_endpoint, &mx_request, &mx_status, &mx_result);
|
|
if( mx_return != MX_SUCCESS ) {
|
|
opal_output(0, "mca_btl_mx_progress: mx_test() failed with status=%dn",
|
|
mx_return);
|
|
continue;
|
|
}
|
|
|
|
frag = mx_status.context;
|
|
if( 0 == frag->base.des_dst_cnt ) { /* it's a send */
|
|
/* call the completion callback */
|
|
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS);
|
|
} else { /* and this one is a receive */
|
|
mca_btl_base_recv_reg_t* reg;
|
|
|
|
reg = &(mx_btl->mx_reg[frag->tag]);
|
|
frag->base.des_dst->seg_len = mx_status.msg_length;
|
|
reg->cbfunc( &(mx_btl->super), frag->tag, &(frag->base), reg->cbdata );
|
|
/*
|
|
* The upper level extract the data from the fragment. Now we can register the fragment
|
|
* again with the MX BTL.
|
|
*/
|
|
mx_segment.segment_ptr = frag->base.des_dst->seg_addr.pval;
|
|
mx_segment.segment_length = mca_btl_mx_module.super.btl_eager_limit;
|
|
mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1, (uint64_t)frag->tag,
|
|
(uint64_t)0xffffffffffffffffULL,
|
|
frag, &(frag->mx_request) );
|
|
if( MX_SUCCESS != mx_return ) {
|
|
opal_output( 0, "Fail to re-register a fragment with the MX NIC ...\n" );
|
|
}
|
|
}
|
|
|
|
/*MCA_BTL_MX_PROGRESS(mx_btl, mx_status);*/
|
|
/*
|
|
* on the mx_status we have now the pointer attached to the request. This pointer indicate
|
|
* which fragment we are working on. On the status we have the status of the operation, so
|
|
* we know what we are supposed to do next.
|
|
*/
|
|
num_progressed++;
|
|
}
|
|
return num_progressed;
|
|
}
|
|
|