Allow smart connections to be set up. Each peer now has attached to it a unique
id based on the last half of the mapper MAC. This allows us to figure out how to connect peers. This allows the MX BTL to be used in a cluster-of-clusters configuration where each cluster has MX internally, as well as on a multi-rail MX system. This commit was SVN r14932.
Этот коммит содержится в:
родитель
e0e4163f53
Коммит
6a5e039466
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
|
@ -3,7 +3,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -107,18 +107,24 @@ typedef struct mca_btl_mx_component_t mca_btl_mx_component_t;
|
||||
OMPI_MODULE_DECLSPEC extern mca_btl_mx_component_t mca_btl_mx_component;
|
||||
|
||||
/**
|
||||
* BTL Module Interface
|
||||
* BTL Module Interface.
|
||||
* Each BTL correspond to a high level vision of a network interface. The
|
||||
* current version of the MX BTL is not able to handle stripping of the
|
||||
* messages by itself. Therefore, it rely on the PML layer for that.
|
||||
*/
|
||||
struct mca_btl_mx_module_t {
|
||||
mca_btl_base_module_t super; /**< base BTL interface */
|
||||
mca_btl_base_recv_reg_t mx_reg[MCA_BTL_TAG_MAX];
|
||||
mca_btl_base_module_t super; /**< base BTL interface */
|
||||
mca_btl_base_recv_reg_t mx_reg[MCA_BTL_TAG_MAX]; /**< the PML registered callbacks */
|
||||
mx_endpoint_t mx_endpoint; /**< local MX endpoint */
|
||||
mx_endpoint_addr_t mx_endpoint_addr; /**< local MX endpoint address */
|
||||
uint32_t mx_unique_network_id; /**< unique identifier for this BTL,
|
||||
* based on the MAC address of the
|
||||
* mapper used to route messages.
|
||||
*/
|
||||
opal_list_t mx_peers; /**< list of peers */
|
||||
|
||||
mx_endpoint_t mx_endpoint; /**< */
|
||||
mx_endpoint_addr_t mx_endpoint_addr; /**< */
|
||||
opal_list_t mx_peers; /**< */
|
||||
|
||||
int32_t mx_posted_request; /**< number of posted MX request */
|
||||
opal_mutex_t mx_lock; /**< lock for accessing module state */
|
||||
int32_t mx_posted_request; /**< number of posted MX request */
|
||||
opal_mutex_t mx_lock; /**< lock for accessing module state */
|
||||
};
|
||||
typedef struct mca_btl_mx_module_t mca_btl_mx_module_t;
|
||||
extern mca_btl_mx_module_t mca_btl_mx_module;
|
||||
|
@ -2,7 +2,7 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -36,6 +36,12 @@
|
||||
#include "btl_mx_frag.h"
|
||||
#include "btl_mx_endpoint.h"
|
||||
|
||||
#if MX_HAVE_MAPPER_STATE
|
||||
#include "mx_io.h"
|
||||
#include "mx_internals/mx__fops.h"
|
||||
#include "mx_internals/mx__driver_interface.h"
|
||||
#endif /* MX_HAVE_MAPPER_STATE */
|
||||
|
||||
mca_btl_mx_component_t mca_btl_mx_component = {
|
||||
{
|
||||
/* First, the mca_base_component_t struct containing meta information
|
||||
@ -239,6 +245,35 @@ static mca_btl_mx_module_t* mca_btl_mx_create(uint64_t addr)
|
||||
mca_btl_mx_finalize( &mx_btl->super );
|
||||
return NULL;
|
||||
}
|
||||
#if MX_HAVE_MAPPER_STATE
|
||||
{
|
||||
mx_return_t ret;
|
||||
mx_endpt_handle_t endp_handle;
|
||||
mx_mapper_state_t ms;
|
||||
|
||||
ret = mx_open_board( nic_id, &endp_handle );
|
||||
if( MX_SUCCESS != ret ) {
|
||||
opal_output( 0, "Unable to open board %d: %s\n", nic_id, mx_strerror(ret) );
|
||||
mca_btl_mx_finalize( &mx_btl->super );
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ms.board_number = nic_id;
|
||||
ms.iport = 0;
|
||||
ret = mx__get_mapper_state( endp_handle, &ms );
|
||||
if( MX_SUCCESS != ret ) {
|
||||
opal_output( 0, "get_mapper_state failed for board %d: %s\n",
|
||||
nic_id, mx_strerror(ret) );
|
||||
mca_btl_mx_finalize( &mx_btl->super );
|
||||
return NULL;
|
||||
}
|
||||
mx_btl->mx_unique_network_id = ((ms.mapper_mac[2] << 24) +
|
||||
(ms.mapper_mac[3] << 16) +
|
||||
(ms.mapper_mac[4] << 8) +
|
||||
(ms.mapper_mac[5]));
|
||||
}
|
||||
#endif /* MX_HAVE_MAPPER_STATE */
|
||||
|
||||
#if 0
|
||||
{
|
||||
int counters, board, i, value, *counters_value;
|
||||
@ -352,9 +387,8 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
|
||||
opal_setenv( "MX_PIPELINE_LOG", "0", true, &environ );
|
||||
|
||||
/* First check if MX is available ... */
|
||||
if(OMPI_SUCCESS!=ompi_common_mx_initialize()) {
|
||||
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
|
||||
NULL, 0);
|
||||
if( OMPI_SUCCESS != ompi_common_mx_initialize() ) {
|
||||
mca_pml_base_modex_send( &mca_btl_mx_component.super.btl_version, NULL, 0 );
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -437,7 +471,7 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
|
||||
}
|
||||
|
||||
size = sizeof(mca_btl_mx_addr_t) * mca_btl_mx_component.mx_num_btls;
|
||||
mx_addrs = (mca_btl_mx_addr_t*)malloc( size );
|
||||
mx_addrs = (mca_btl_mx_addr_t*)calloc( mca_btl_mx_component.mx_num_btls, sizeof(mca_btl_mx_addr_t) );
|
||||
if( NULL == mx_addrs ) {
|
||||
free( nic_addrs );
|
||||
return NULL;
|
||||
@ -445,27 +479,30 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
|
||||
|
||||
/* create a btl for each NIC */
|
||||
for( i = count = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) {
|
||||
mca_btl_mx_module_t* btl = mca_btl_mx_create(nic_addrs[i]);
|
||||
if( NULL == btl ) {
|
||||
mca_btl_mx_module_t* mx_btl = mca_btl_mx_create(nic_addrs[i]);
|
||||
if( NULL == mx_btl ) {
|
||||
continue;
|
||||
}
|
||||
status = mx_decompose_endpoint_addr( btl->mx_endpoint_addr, &(mx_addrs[i].nic_id),
|
||||
status = mx_decompose_endpoint_addr( mx_btl->mx_endpoint_addr, &(mx_addrs[i].nic_id),
|
||||
&(mx_addrs[i].endpoint_id) );
|
||||
if( MX_SUCCESS != status ) {
|
||||
OBJ_RELEASE( btl );
|
||||
mca_btl_mx_finalize( &mx_btl->super );
|
||||
continue;
|
||||
}
|
||||
mx_addrs[i].unique_network_id = mx_btl->mx_unique_network_id;
|
||||
|
||||
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
BTL_MX_ADDR_HTON(mx_addrs[i]);
|
||||
#endif
|
||||
mca_btl_mx_component.mx_btls[count++] = btl;
|
||||
mca_btl_mx_component.mx_btls[count++] = mx_btl;
|
||||
}
|
||||
mca_btl_mx_component.mx_num_btls = count;
|
||||
*num_btl_modules = count;
|
||||
size = sizeof(mca_btl_mx_addr_t) * count;
|
||||
if( 0 == count ) {
|
||||
/* No active BTL module */
|
||||
return NULL;
|
||||
}
|
||||
mca_btl_mx_component.mx_num_btls = count;
|
||||
|
||||
/* publish the MX addresses via the MCA framework */
|
||||
mca_pml_base_modex_send( &mca_btl_mx_component.super.btl_version, mx_addrs, size );
|
||||
@ -482,7 +519,6 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
|
||||
}
|
||||
memcpy( btls, mca_btl_mx_component.mx_btls,
|
||||
mca_btl_mx_component.mx_num_btls*sizeof(mca_btl_mx_module_t*) );
|
||||
*num_btl_modules = mca_btl_mx_component.mx_num_btls;
|
||||
return btls;
|
||||
}
|
||||
|
||||
|
@ -43,23 +43,23 @@ extern "C" {
|
||||
struct mca_btl_mx_addr_t {
|
||||
uint64_t nic_id;
|
||||
uint32_t endpoint_id;
|
||||
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
uint8_t padding[4];
|
||||
#endif
|
||||
uint32_t unique_network_id; /* Unique identifier for each MX network */
|
||||
};
|
||||
typedef struct mca_btl_mx_addr_t mca_btl_mx_addr_t;
|
||||
|
||||
#define BTL_MX_ADDR_HTON(h) \
|
||||
do { \
|
||||
h.nic_id = hton64(h.nic_id); \
|
||||
h.endpoint_id = htonl(h.endpoint_id); \
|
||||
} while (0)
|
||||
#define BTL_MX_ADDR_HTON(h) \
|
||||
do { \
|
||||
h.nic_id = hton64(h.nic_id); \
|
||||
h.endpoint_id = htonl(h.endpoint_id); \
|
||||
h.unique_network_id = htonl(h.unique_network_id); \
|
||||
} while (0)
|
||||
|
||||
#define BTL_MX_ADDR_NTOH(h) \
|
||||
do { \
|
||||
h.nic_id = ntoh64(h.nic_id); \
|
||||
h.endpoint_id = ntohl(h.endpoint_id); \
|
||||
} while (0)
|
||||
#define BTL_MX_ADDR_NTOH(h) \
|
||||
do { \
|
||||
h.nic_id = ntoh64(h.nic_id); \
|
||||
h.endpoint_id = ntohl(h.endpoint_id); \
|
||||
h.unique_network_id = ntohl(h.unique_network_id); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/**
|
||||
|
@ -135,7 +135,8 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
|
||||
mca_btl_mx_endpoint_t* module_endpoint )
|
||||
{
|
||||
mca_btl_mx_addr_t *mx_peers;
|
||||
int rc;
|
||||
int rc, i, j;
|
||||
mca_btl_mx_module_t* mx_btl;
|
||||
size_t size;
|
||||
|
||||
/* query for the peer address info */
|
||||
@ -160,14 +161,41 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc,
|
||||
}
|
||||
|
||||
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
{
|
||||
int i;
|
||||
for (i = 0 ; i < module_proc->mx_peers_count ; ++i) {
|
||||
BTL_MX_ADDR_NTOH(mx_peers[i]);
|
||||
}
|
||||
for (i = 0 ; i < module_proc->mx_peers_count ; ++i) {
|
||||
BTL_MX_ADDR_NTOH(mx_peers[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Check if there is any Myrinet network between myself and the peer
|
||||
*/
|
||||
for( i = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) {
|
||||
mx_btl = mca_btl_mx_component.mx_btls[i];
|
||||
|
||||
for( j = 0; j < module_proc->mx_peers_count; j++ ) {
|
||||
if( mx_btl->mx_unique_network_id == mx_peers[j].unique_network_id ) {
|
||||
/* There is at least one connection between these two nodes */
|
||||
goto create_peer_endpoint;
|
||||
}
|
||||
}
|
||||
}
|
||||
module_proc->mx_peers_count = 0;
|
||||
/**
|
||||
* No Myrinet connectivity. Let the PML layer figure out another
|
||||
* way to communicate with the peer.
|
||||
*/
|
||||
return OMPI_ERROR;
|
||||
create_peer_endpoint:
|
||||
mx_btl = module_endpoint->endpoint_btl;
|
||||
for( j = 0; j < module_proc->mx_peers_count; j++ ) {
|
||||
if( mx_btl->mx_unique_network_id == mx_peers[j].unique_network_id ) {
|
||||
module_endpoint->mx_peer.nic_id = mx_peers[j].nic_id;
|
||||
module_endpoint->mx_peer.endpoint_id = mx_peers[j].endpoint_id;
|
||||
module_proc->proc_addr_index = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
module_proc->mx_peers = mx_peers;
|
||||
|
||||
if( NULL == module_proc->proc_endpoints ) {
|
||||
@ -191,39 +219,28 @@ int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
|
||||
|
||||
module_endpoint->status = MCA_BTL_MX_CONNECTION_PENDING;
|
||||
|
||||
for( i = module_proc->proc_addr_index; i < module_proc->mx_peers_count; i++ ) {
|
||||
|
||||
retry_connect:
|
||||
mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint,
|
||||
module_proc->mx_peers[i].nic_id, module_proc->mx_peers[i].endpoint_id,
|
||||
mca_btl_mx_component.mx_filter, mca_btl_mx_component.mx_timeout, &mx_remote_addr );
|
||||
if( MX_SUCCESS != mx_status ) {
|
||||
if( MX_TIMEOUT == mx_status )
|
||||
if( num_retry++ < mca_btl_mx_component.mx_connection_retries )
|
||||
goto retry_connect;
|
||||
{
|
||||
char peer_name[MX_MAX_HOSTNAME_LEN];
|
||||
|
||||
if( MX_SUCCESS != mx_nic_id_to_hostname( module_proc->mx_peers[i].nic_id, peer_name ) )
|
||||
sprintf( peer_name, "unknown %lx nic_id", (long)module_proc->mx_peers[i].nic_id );
|
||||
|
||||
opal_output( 0, "mx_connect fail for %s(%dth remote address) with key %x (error %s)\n",
|
||||
peer_name, i, mca_btl_mx_component.mx_filter, mx_strerror(mx_status) );
|
||||
}
|
||||
continue;
|
||||
retry_connect:
|
||||
mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint,
|
||||
module_endpoint->mx_peer.nic_id, module_endpoint->mx_peer.endpoint_id,
|
||||
mca_btl_mx_component.mx_filter, mca_btl_mx_component.mx_timeout, &mx_remote_addr );
|
||||
if( MX_SUCCESS != mx_status ) {
|
||||
if( MX_TIMEOUT == mx_status )
|
||||
if( num_retry++ < mca_btl_mx_component.mx_connection_retries )
|
||||
goto retry_connect;
|
||||
{
|
||||
char peer_name[MX_MAX_HOSTNAME_LEN];
|
||||
|
||||
if( MX_SUCCESS != mx_nic_id_to_hostname( module_endpoint->mx_peer.nic_id, peer_name ) )
|
||||
sprintf( peer_name, "unknown %lx nic_id", (long)module_endpoint->mx_peer.nic_id );
|
||||
|
||||
opal_output( 0, "mx_connect fail for %s(%dth remote address) with key %x (error %s)\n",
|
||||
peer_name, i, mca_btl_mx_component.mx_filter, mx_strerror(mx_status) );
|
||||
}
|
||||
module_endpoint->mx_peer.nic_id = module_proc->mx_peers[i].nic_id;
|
||||
module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[i].endpoint_id;
|
||||
module_endpoint->mx_peer_addr = mx_remote_addr;
|
||||
module_endpoint->status = MCA_BTL_MX_CONNECTED;
|
||||
module_proc->proc_addr_index = i;
|
||||
break;
|
||||
}
|
||||
|
||||
if( i == module_proc->mx_peers_count ) { /* no available connection */
|
||||
module_endpoint->status = MCA_BTL_MX_NOT_REACHEABLE;
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
module_endpoint->mx_peer_addr = mx_remote_addr;
|
||||
module_endpoint->status = MCA_BTL_MX_CONNECTED;
|
||||
|
||||
module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint;
|
||||
return OMPI_SUCCESS;
|
||||
|
@ -3,7 +3,7 @@
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
# Copyright (c) 2004-2007 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user