1
1

A better version allowing for multi-rails or clusters of clusters. A lot of cleanups.

This commit was SVN r14963.
Этот коммит содержится в:
George Bosilca 2007-06-08 20:37:20 +00:00
родитель c66cf32ee2
Коммит e2dd0a50fc
3 изменённых файлов: 128 добавлений и 92 удалений

Просмотреть файл

@ -117,7 +117,7 @@ struct mca_btl_mx_module_t {
mca_btl_base_module_t super; /**< base BTL interface */
mca_btl_base_recv_reg_t mx_reg[MCA_BTL_TAG_MAX]; /**< the PML registered callbacks */
mx_endpoint_t mx_endpoint; /**< local MX endpoint */
mx_endpoint_addr_t mx_endpoint_addr; /**< local MX endpoint address */
mx_endpoint_addr_t mx_endpoint_addr; /**< local MX endpoint address */
uint32_t mx_unique_network_id; /**< unique identifier for this BTL,
* based on the MAC address of the
* mapper used to route messages.

Просмотреть файл

@ -130,9 +130,18 @@ int mca_btl_mx_component_open(void)
}
mca_base_param_reg_int( (mca_base_component_t*)&mca_btl_mx_component, "max_posted_recv",
"Number of received posted in advance. Increasing this number for communication bound application can lead to visible improvement in performances",
"Number of received posted in advance. Increasing this number for"
" communication bound application can lead to visible improvement"
" in performances",
false, false, 16, &mca_btl_mx_component.mx_max_posted_recv );
mca_base_param_reg_string( (mca_base_component_t*)&mca_btl_mx_component, "if_include",
"Myrinet card to use (last 6 digits from the mapper MAC)",
false, false, NULL, &mca_btl_mx_component.mx_if_include );
mca_base_param_reg_string( (mca_base_component_t*)&mca_btl_mx_component, "if_exclude",
"Myrinet card to avoid (last 6 digits from the mapper MAC)",
false, false, NULL, &mca_btl_mx_component.mx_if_exclude );
mca_btl_mx_module.super.btl_exclusivity = 50;
mca_btl_mx_module.super.btl_eager_limit = 4096;
mca_btl_mx_module.super.btl_min_send_size = 4096;
@ -166,6 +175,15 @@ int mca_btl_mx_component_close(void)
OBJ_DESTRUCT(&mca_btl_mx_component.mx_send_user_frags);
OBJ_DESTRUCT(&mca_btl_mx_component.mx_procs);
OBJ_DESTRUCT(&mca_btl_mx_component.mx_lock);
if( NULL != mca_btl_mx_component.mx_if_include ) {
free( mca_btl_mx_component.mx_if_include );
mca_btl_mx_component.mx_if_include = NULL;
}
if( NULL != mca_btl_mx_component.mx_if_exclude ) {
free( mca_btl_mx_component.mx_if_exclude );
mca_btl_mx_component.mx_if_exclude = NULL;
}
return OMPI_SUCCESS;
}
@ -218,13 +236,58 @@ static mca_btl_mx_module_t* mca_btl_mx_create(uint64_t addr)
{
mca_btl_mx_module_t* mx_btl;
mx_return_t status;
uint32_t nic_id;
uint32_t nic_id, mx_unique_network_id = 0;
char mapper_mac[7], *where;
status = mx_nic_id_to_board_number( addr, &nic_id );
if( MX_SUCCESS != status ) {
return NULL;
}
#if MX_HAVE_MAPPER_STATE
{
mx_return_t ret;
mx_endpt_handle_t endp_handle;
mx_mapper_state_t ms;
ret = mx_open_board( nic_id, &endp_handle );
if( MX_SUCCESS != ret ) {
opal_output( 0, "Unable to open board %d: %s\n", nic_id, mx_strerror(ret) );
return NULL;
}
ms.board_number = nic_id;
ms.iport = 0;
ret = mx__get_mapper_state( endp_handle, &ms );
if( MX_SUCCESS != ret ) {
opal_output( 0, "get_mapper_state failed for board %d: %s\n",
nic_id, mx_strerror(ret) );
return NULL;
}
/* Keep the first 4 bytes for the network speed */
mx_unique_network_id = ((ms.mapper_mac[3] << 16) +
(ms.mapper_mac[4] << 8) +
(ms.mapper_mac[5]));
}
#endif /* MX_HAVE_MAPPER_STATE */
/* Try to figure out if we are allowed to use this network */
snprintf( mapper_mac, 7, "%6x", mx_unique_network_id );
if( (NULL != mca_btl_mx_component.mx_if_exclude) &&
(NULL != (where = strstr(mca_btl_mx_component.mx_if_exclude, mapper_mac))) ) {
opal_output( 0, "MX network %d connected to the mapper %s has been excluded\n",
nic_id, mapper_mac );
return NULL;
}
else if( (NULL != mca_btl_mx_component.mx_if_include) &&
(NULL == (where = strstr(mca_btl_mx_component.mx_if_include, mapper_mac))) ) {
opal_output( 0, "MX network %d connected to the mapper %s has not been included\n",
nic_id, mapper_mac );
return NULL;
}
mx_btl = malloc(sizeof(mca_btl_mx_module_t));
if( NULL == mx_btl ) return NULL;
@ -243,52 +306,27 @@ static mca_btl_mx_module_t* mca_btl_mx_create(uint64_t addr)
mca_btl_mx_finalize( &mx_btl->super );
return NULL;
}
#if MX_HAVE_MAPPER_STATE
{
mx_return_t ret;
mx_endpt_handle_t endp_handle;
mx_mapper_state_t ms;
ret = mx_open_board( nic_id, &endp_handle );
if( MX_SUCCESS != ret ) {
opal_output( 0, "Unable to open board %d: %s\n", nic_id, mx_strerror(ret) );
mca_btl_mx_finalize( &mx_btl->super );
return NULL;
}
ms.board_number = nic_id;
ms.iport = 0;
ret = mx__get_mapper_state( endp_handle, &ms );
if( MX_SUCCESS != ret ) {
opal_output( 0, "get_mapper_state failed for board %d: %s\n",
nic_id, mx_strerror(ret) );
mca_btl_mx_finalize( &mx_btl->super );
return NULL;
}
/* Keep the first 4 bytes for the network speed */
mx_btl->mx_unique_network_id = ((ms.mapper_mac[3] << 16) +
(ms.mapper_mac[4] << 8) +
(ms.mapper_mac[5]));
mx_btl->mx_unique_network_id = mx_unique_network_id;
#if defined(MX_HAS_NET_TYPE)
{
int value;
if( (status = mx_get_info( mx_btl->mx_endpoint, MX_LINE_SPEED, NULL, 0,
&value, sizeof(int))) != MX_SUCCESS ) {
opal_output( 0, "mx_get_info(MX_LINE_SPEED) failed with status %d (%s)\n",
status, mx_strerror(status) );
}
if( MX_SPEED_2G == value ) {
mx_btl->mx_unique_network_id |= 0xaa00000000;
mx_btl->super.btl_bandwidth = 2000;
} else if( MX_SPEED_10G == value ) {
mx_btl->mx_unique_network_id |= 0xbb00000000;
mx_btl->super.btl_bandwidth = 10000;
} else {
mx_btl->mx_unique_network_id |= 0xcc00000000;
mx_btl->super.btl_bandwidth = 1000; /* some value */
}
#endif /* defined(MX_HAS_NET_TYPE) */
}
if( MX_SPEED_2G == value ) {
mx_btl->mx_unique_network_id |= 0xaa00000000;
mx_btl->super.btl_bandwidth = 2000;
} else if( MX_SPEED_10G == value ) {
mx_btl->mx_unique_network_id |= 0xbb00000000;
mx_btl->super.btl_bandwidth = 10000;
} else {
mx_btl->mx_unique_network_id |= 0xcc00000000;
mx_btl->super.btl_bandwidth = 1000; /* whatever */
}
}
#endif /* MX_HAVE_MAPPER_STATE */
#endif /* defined(MX_HAS_NET_TYPE) */
#if 0
{
@ -476,7 +514,6 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
return NULL;
}
size = sizeof(mca_btl_mx_addr_t) * mca_btl_mx_component.mx_num_btls;
mx_addrs = (mca_btl_mx_addr_t*)calloc( mca_btl_mx_component.mx_num_btls, sizeof(mca_btl_mx_addr_t) );
if( NULL == mx_addrs ) {
free( nic_addrs );
@ -489,29 +526,30 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
if( NULL == mx_btl ) {
continue;
}
status = mx_decompose_endpoint_addr( mx_btl->mx_endpoint_addr, &(mx_addrs[i].nic_id),
&(mx_addrs[i].endpoint_id) );
status = mx_decompose_endpoint_addr( mx_btl->mx_endpoint_addr, &(mx_addrs[count].nic_id),
&(mx_addrs[count].endpoint_id) );
if( MX_SUCCESS != status ) {
mca_btl_mx_finalize( &mx_btl->super );
continue;
}
mx_addrs[i].unique_network_id = mx_btl->mx_unique_network_id;
mx_addrs[count].unique_network_id = mx_btl->mx_unique_network_id;
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
BTL_MX_ADDR_HTON(mx_addrs[i]);
BTL_MX_ADDR_HTON(mx_addrs[count]);
#endif
mca_btl_mx_component.mx_btls[count++] = mx_btl;
mca_btl_mx_component.mx_btls[count] = mx_btl;
count++; /* one more succesfully initialized MX interface */
}
mca_btl_mx_component.mx_num_btls = count;
*num_btl_modules = count;
size = sizeof(mca_btl_mx_addr_t) * count;
if( 0 == count ) {
/* No active BTL module */
return NULL;
}
/* publish the MX addresses via the MCA framework */
mca_pml_base_modex_send( &mca_btl_mx_component.super.btl_version, mx_addrs, size );
mca_pml_base_modex_send( &mca_btl_mx_component.super.btl_version, mx_addrs,
sizeof(mca_btl_mx_addr_t) * mca_btl_mx_component.mx_num_btls );
free( nic_addrs );
free( mx_addrs );

Просмотреть файл

@ -105,6 +105,9 @@ static mca_btl_mx_proc_t* mca_btl_mx_proc_lookup_ompi(ompi_proc_t* ompi_proc)
mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
{
mca_btl_mx_proc_t* module_proc = NULL;
mca_btl_mx_addr_t *mx_peers;
int rc, i;
size_t size;
/* Check if we have already created a MX proc
* structure for this ompi process */
@ -114,13 +117,37 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
return module_proc;
}
/* Oops! First time, gotta create a new MX proc
* out of the ompi_proc ... */
/* query for the peer address info */
rc = mca_pml_base_modex_recv( &mca_btl_mx_component.super.btl_version,
ompi_proc, (void*)&mx_peers, &size );
if( OMPI_SUCCESS != rc ) {
opal_output( 0, "mca_pml_base_modex_recv failed for peer [%ld,%ld,%ld]",
ORTE_NAME_ARGS(&ompi_proc->proc_name) );
return NULL;
}
if( size < sizeof(mca_btl_mx_addr_t) ) { /* no available connection */
return NULL;
}
if( (size % sizeof(mca_btl_mx_addr_t)) != 0 ) {
opal_output( 0, "invalid mx address for peer [%ld,%ld,%ld]",
ORTE_NAME_ARGS(&ompi_proc->proc_name) );
return NULL;
}
module_proc = OBJ_NEW(mca_btl_mx_proc_t);
module_proc->proc_ompi = ompi_proc;
module_proc->mx_peers_count = size / sizeof(mca_btl_mx_addr_t);
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
for (i = 0 ; i < module_proc->mx_peers_count ; ++i) {
BTL_MX_ADDR_NTOH(mx_peers[i]);
}
#endif
module_proc->mx_peers = mx_peers;
return module_proc;
}
@ -133,37 +160,8 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
mca_btl_mx_endpoint_t* module_endpoint )
{
mca_btl_mx_addr_t *mx_peers;
int rc, i, j;
mca_btl_mx_module_t* mx_btl;
size_t size;
/* query for the peer address info */
rc = mca_pml_base_modex_recv( &mca_btl_mx_component.super.btl_version,
module_proc->proc_ompi, (void*)&mx_peers, &size );
if( OMPI_SUCCESS != rc ) {
opal_output( 0, "mca_pml_base_modex_recv failed for peer [%ld,%ld,%ld]",
ORTE_NAME_ARGS(&module_proc->proc_ompi->proc_name) );
OBJ_RELEASE(module_proc);
return OMPI_ERR_OUT_OF_RESOURCE;
}
if( (size % sizeof(mca_btl_mx_addr_t)) != 0 ) {
opal_output( 0, "invalid mx address for peer [%ld,%ld,%ld]",
ORTE_NAME_ARGS(&module_proc->proc_ompi->proc_name) );
OBJ_RELEASE(module_proc);
return OMPI_ERROR;
}
module_proc->mx_peers_count = size / sizeof(mca_btl_mx_addr_t);
if( 0 == module_proc->mx_peers_count ) { /* no available connection */
return OMPI_ERROR;
}
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
for (i = 0 ; i < module_proc->mx_peers_count ; ++i) {
BTL_MX_ADDR_NTOH(mx_peers[i]);
}
#endif
int i, j;
/**
* Check if there is any Myrinet network between myself and the peer
@ -172,7 +170,7 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
mx_btl = mca_btl_mx_component.mx_btls[i];
for( j = 0; j < module_proc->mx_peers_count; j++ ) {
if( mx_btl->mx_unique_network_id == mx_peers[j].unique_network_id ) {
if( mx_btl->mx_unique_network_id == module_proc->mx_peers[j].unique_network_id ) {
/* There is at least one connection between these two nodes */
goto create_peer_endpoint;
}
@ -187,15 +185,13 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
create_peer_endpoint:
mx_btl = module_endpoint->endpoint_btl;
for( j = 0; j < module_proc->mx_peers_count; j++ ) {
if( mx_btl->mx_unique_network_id == mx_peers[j].unique_network_id ) {
module_endpoint->mx_peer.nic_id = mx_peers[j].nic_id;
module_endpoint->mx_peer.endpoint_id = mx_peers[j].endpoint_id;
if( mx_btl->mx_unique_network_id == module_proc->mx_peers[j].unique_network_id ) {
module_endpoint->mx_peer.nic_id = module_proc->mx_peers[j].nic_id;
module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[j].endpoint_id;
break;
}
}
module_proc->mx_peers = mx_peers;
if( NULL == module_proc->proc_endpoints ) {
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(module_proc->mx_peers_count * sizeof(mca_btl_base_endpoint_t*));
@ -231,8 +227,10 @@ int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
if( MX_SUCCESS != mx_nic_id_to_hostname( module_endpoint->mx_peer.nic_id, peer_name ) )
sprintf( peer_name, "unknown %lx nic_id", (long)module_endpoint->mx_peer.nic_id );
opal_output( 0, "mx_connect fail for %s with key %x (error %s)\n",
peer_name, mca_btl_mx_component.mx_filter, mx_strerror(mx_status) );
opal_output( 0, "mx_connect fail for %s with key %x (error %s)\n\tUnique ID (local %x remote %x)\n",
peer_name, mca_btl_mx_component.mx_filter, mx_strerror(mx_status),
module_endpoint->endpoint_btl->mx_unique_network_id,
module_endpoint->mx_peer.unique_network_id );
}
module_endpoint->status = MCA_BTL_MX_NOT_REACHEABLE;
return OMPI_ERROR;