diff --git a/ompi/mca/btl/mx/btl_mx.c b/ompi/mca/btl/mx/btl_mx.c index 641700134c..4e3ff7c175 100644 --- a/ompi/mca/btl/mx/btl_mx.c +++ b/ompi/mca/btl/mx/btl_mx.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University + * Copyright (c) 2004-2007 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/btl/mx/btl_mx.h b/ompi/mca/btl/mx/btl_mx.h index cb127acb0a..1ccc8a1053 100644 --- a/ompi/mca/btl/mx/btl_mx.h +++ b/ompi/mca/btl/mx/btl_mx.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University + * Copyright (c) 2004-2007 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -107,18 +107,24 @@ typedef struct mca_btl_mx_component_t mca_btl_mx_component_t; OMPI_MODULE_DECLSPEC extern mca_btl_mx_component_t mca_btl_mx_component; /** - * BTL Module Interface + * BTL Module Interface. + * Each BTL corresponds to a high-level view of a network interface. The + * current version of the MX BTL is not able to handle striping of the + * messages by itself. Therefore, it relies on the PML layer for that. 
*/ struct mca_btl_mx_module_t { - mca_btl_base_module_t super; /**< base BTL interface */ - mca_btl_base_recv_reg_t mx_reg[MCA_BTL_TAG_MAX]; + mca_btl_base_module_t super; /**< base BTL interface */ + mca_btl_base_recv_reg_t mx_reg[MCA_BTL_TAG_MAX]; /**< the PML registered callbacks */ + mx_endpoint_t mx_endpoint; /**< local MX endpoint */ + mx_endpoint_addr_t mx_endpoint_addr; /**< local MX endpoint address */ + uint32_t mx_unique_network_id; /**< unique identifier for this BTL, + * based on the MAC address of the + * mapper used to route messages. + */ + opal_list_t mx_peers; /**< list of peers */ - mx_endpoint_t mx_endpoint; /**< */ - mx_endpoint_addr_t mx_endpoint_addr; /**< */ - opal_list_t mx_peers; /**< */ - - int32_t mx_posted_request; /**< number of posted MX request */ - opal_mutex_t mx_lock; /**< lock for accessing module state */ + int32_t mx_posted_request; /**< number of posted MX request */ + opal_mutex_t mx_lock; /**< lock for accessing module state */ }; typedef struct mca_btl_mx_module_t mca_btl_mx_module_t; extern mca_btl_mx_module_t mca_btl_mx_module; diff --git a/ompi/mca/btl/mx/btl_mx_component.c b/ompi/mca/btl/mx/btl_mx_component.c index eac8e290d6..1d1193809d 100644 --- a/ompi/mca/btl/mx/btl_mx_component.c +++ b/ompi/mca/btl/mx/btl_mx_component.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University + * Copyright (c) 2004-2007 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -36,6 +36,12 @@ #include "btl_mx_frag.h" #include "btl_mx_endpoint.h" +#if MX_HAVE_MAPPER_STATE +#include "mx_io.h" +#include "mx_internals/mx__fops.h" +#include "mx_internals/mx__driver_interface.h" +#endif /* MX_HAVE_MAPPER_STATE */ + mca_btl_mx_component_t mca_btl_mx_component = { { /* First, the mca_base_component_t struct containing meta information @@ -239,6 +245,35 @@ static mca_btl_mx_module_t* mca_btl_mx_create(uint64_t addr) mca_btl_mx_finalize( &mx_btl->super ); return NULL; } +#if MX_HAVE_MAPPER_STATE + { + mx_return_t ret; + mx_endpt_handle_t endp_handle; + mx_mapper_state_t ms; + + ret = mx_open_board( nic_id, &endp_handle ); + if( MX_SUCCESS != ret ) { + opal_output( 0, "Unable to open board %d: %s\n", nic_id, mx_strerror(ret) ); + mca_btl_mx_finalize( &mx_btl->super ); + return NULL; + } + + ms.board_number = nic_id; + ms.iport = 0; + ret = mx__get_mapper_state( endp_handle, &ms ); + if( MX_SUCCESS != ret ) { + opal_output( 0, "get_mapper_state failed for board %d: %s\n", + nic_id, mx_strerror(ret) ); + mca_btl_mx_finalize( &mx_btl->super ); + return NULL; + } + mx_btl->mx_unique_network_id = ((ms.mapper_mac[2] << 24) + + (ms.mapper_mac[3] << 16) + + (ms.mapper_mac[4] << 8) + + (ms.mapper_mac[5])); + } +#endif /* MX_HAVE_MAPPER_STATE */ + #if 0 { int counters, board, i, value, *counters_value; @@ -352,9 +387,8 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules, opal_setenv( "MX_PIPELINE_LOG", "0", true, &environ ); /* First check if MX is available ... 
*/ - if(OMPI_SUCCESS!=ompi_common_mx_initialize()) { - mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version, - NULL, 0); + if( OMPI_SUCCESS != ompi_common_mx_initialize() ) { + mca_pml_base_modex_send( &mca_btl_mx_component.super.btl_version, NULL, 0 ); return NULL; } @@ -437,7 +471,7 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules, } size = sizeof(mca_btl_mx_addr_t) * mca_btl_mx_component.mx_num_btls; - mx_addrs = (mca_btl_mx_addr_t*)malloc( size ); + mx_addrs = (mca_btl_mx_addr_t*)calloc( mca_btl_mx_component.mx_num_btls, sizeof(mca_btl_mx_addr_t) ); if( NULL == mx_addrs ) { free( nic_addrs ); return NULL; @@ -445,27 +479,30 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules, /* create a btl for each NIC */ for( i = count = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) { - mca_btl_mx_module_t* btl = mca_btl_mx_create(nic_addrs[i]); - if( NULL == btl ) { + mca_btl_mx_module_t* mx_btl = mca_btl_mx_create(nic_addrs[i]); + if( NULL == mx_btl ) { continue; } - status = mx_decompose_endpoint_addr( btl->mx_endpoint_addr, &(mx_addrs[i].nic_id), + status = mx_decompose_endpoint_addr( mx_btl->mx_endpoint_addr, &(mx_addrs[i].nic_id), &(mx_addrs[i].endpoint_id) ); if( MX_SUCCESS != status ) { - OBJ_RELEASE( btl ); + mca_btl_mx_finalize( &mx_btl->super ); continue; } + mx_addrs[i].unique_network_id = mx_btl->mx_unique_network_id; #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT BTL_MX_ADDR_HTON(mx_addrs[i]); #endif - mca_btl_mx_component.mx_btls[count++] = btl; + mca_btl_mx_component.mx_btls[count++] = mx_btl; } + mca_btl_mx_component.mx_num_btls = count; + *num_btl_modules = count; size = sizeof(mca_btl_mx_addr_t) * count; if( 0 == count ) { /* No active BTL module */ + return NULL; } - mca_btl_mx_component.mx_num_btls = count; /* publish the MX addresses via the MCA framework */ mca_pml_base_modex_send( &mca_btl_mx_component.super.btl_version, mx_addrs, size ); @@ -482,7 +519,6 @@ mca_btl_base_module_t** 
mca_btl_mx_component_init(int *num_btl_modules, } memcpy( btls, mca_btl_mx_component.mx_btls, mca_btl_mx_component.mx_num_btls*sizeof(mca_btl_mx_module_t*) ); - *num_btl_modules = mca_btl_mx_component.mx_num_btls; return btls; } diff --git a/ompi/mca/btl/mx/btl_mx_endpoint.h b/ompi/mca/btl/mx/btl_mx_endpoint.h index 30fc463955..c82d306a8c 100644 --- a/ompi/mca/btl/mx/btl_mx_endpoint.h +++ b/ompi/mca/btl/mx/btl_mx_endpoint.h @@ -43,23 +43,25 @@ extern "C" { struct mca_btl_mx_addr_t { uint64_t nic_id; uint32_t endpoint_id; -#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT - uint8_t padding[4]; -#endif + uint32_t unique_network_id; /* Unique identifier for each MX network */ }; typedef struct mca_btl_mx_addr_t mca_btl_mx_addr_t; -#define BTL_MX_ADDR_HTON(h) \ -do { \ - h.nic_id = hton64(h.nic_id); \ - h.endpoint_id = htonl(h.endpoint_id); \ -} while (0) +/* Pass every field of the mca_btl_mx_addr_t lvalue h through hton64/htonl in place; h is evaluated several times. */ +#define BTL_MX_ADDR_HTON(h) \ + do { \ + (h).nic_id = hton64((h).nic_id); \ + (h).endpoint_id = htonl((h).endpoint_id); \ + (h).unique_network_id = htonl((h).unique_network_id); \ + } while (0) -#define BTL_MX_ADDR_NTOH(h) \ -do { \ - h.nic_id = ntoh64(h.nic_id); \ - h.endpoint_id = ntohl(h.endpoint_id); \ -} while (0) +/* Pass every field of the mca_btl_mx_addr_t lvalue h through ntoh64/ntohl in place; h is evaluated several times. */ +#define BTL_MX_ADDR_NTOH(h) \ + do { \ + (h).nic_id = ntoh64((h).nic_id); \ + (h).endpoint_id = ntohl((h).endpoint_id); \ + (h).unique_network_id = ntohl((h).unique_network_id); \ + } while (0) /** diff --git a/ompi/mca/btl/mx/btl_mx_proc.c b/ompi/mca/btl/mx/btl_mx_proc.c index 3ccda61822..e63530e337 100644 --- a/ompi/mca/btl/mx/btl_mx_proc.c +++ b/ompi/mca/btl/mx/btl_mx_proc.c @@ -135,7 +135,8 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc, mca_btl_mx_endpoint_t* module_endpoint ) { mca_btl_mx_addr_t *mx_peers; - int rc; + int rc, i, j; + mca_btl_mx_module_t* mx_btl; size_t size; /* query for the peer address info */ @@ -160,14 +161,41 @@ int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc, } #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT - { - int i; - for (i = 0 ; i < module_proc->mx_peers_count ; ++i) { - 
BTL_MX_ADDR_NTOH(mx_peers[i]); - } + for (i = 0 ; i < module_proc->mx_peers_count ; ++i) { + BTL_MX_ADDR_NTOH(mx_peers[i]); } #endif + /** + * Check if there is any Myrinet network between myself and the peer + */ + for( i = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) { + mx_btl = mca_btl_mx_component.mx_btls[i]; + + for( j = 0; j < module_proc->mx_peers_count; j++ ) { + if( mx_btl->mx_unique_network_id == mx_peers[j].unique_network_id ) { + /* There is at least one connection between these two nodes */ + goto create_peer_endpoint; + } + } + } + module_proc->mx_peers_count = 0; + /** + * No Myrinet connectivity. Let the PML layer figure out another + * way to communicate with the peer. + */ + return OMPI_ERROR; + create_peer_endpoint: + mx_btl = module_endpoint->endpoint_btl; + for( j = 0; j < module_proc->mx_peers_count; j++ ) { + if( mx_btl->mx_unique_network_id == mx_peers[j].unique_network_id ) { + module_endpoint->mx_peer.nic_id = mx_peers[j].nic_id; + module_endpoint->mx_peer.endpoint_id = mx_peers[j].endpoint_id; + module_proc->proc_addr_index = j; + break; + } + } + module_proc->mx_peers = mx_peers; if( NULL == module_proc->proc_endpoints ) { @@ -191,39 +219,28 @@ int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint ) module_endpoint->status = MCA_BTL_MX_CONNECTION_PENDING; - for( i = module_proc->proc_addr_index; i < module_proc->mx_peers_count; i++ ) { - - retry_connect: - mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint, - module_proc->mx_peers[i].nic_id, module_proc->mx_peers[i].endpoint_id, - mca_btl_mx_component.mx_filter, mca_btl_mx_component.mx_timeout, &mx_remote_addr ); - if( MX_SUCCESS != mx_status ) { - if( MX_TIMEOUT == mx_status ) - if( num_retry++ < mca_btl_mx_component.mx_connection_retries ) - goto retry_connect; - { - char peer_name[MX_MAX_HOSTNAME_LEN]; - - if( MX_SUCCESS != mx_nic_id_to_hostname( module_proc->mx_peers[i].nic_id, peer_name ) ) - sprintf( peer_name, "unknown %lx nic_id", 
(long)module_proc->mx_peers[i].nic_id ); - - opal_output( 0, "mx_connect fail for %s(%dth remote address) with key %x (error %s)\n", - peer_name, i, mca_btl_mx_component.mx_filter, mx_strerror(mx_status) ); - } - continue; + retry_connect: + mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint, + module_endpoint->mx_peer.nic_id, module_endpoint->mx_peer.endpoint_id, + mca_btl_mx_component.mx_filter, mca_btl_mx_component.mx_timeout, &mx_remote_addr ); + if( MX_SUCCESS != mx_status ) { + if( MX_TIMEOUT == mx_status ) + if( num_retry++ < mca_btl_mx_component.mx_connection_retries ) + goto retry_connect; + { + char peer_name[MX_MAX_HOSTNAME_LEN]; + + if( MX_SUCCESS != mx_nic_id_to_hostname( module_endpoint->mx_peer.nic_id, peer_name ) ) + sprintf( peer_name, "unknown %lx nic_id", (long)module_endpoint->mx_peer.nic_id ); + /* Report the address index recorded by mca_btl_mx_proc_insert; 'i' is no longer a loop index in this function. */ + opal_output( 0, "mx_connect fail for %s(%dth remote address) with key %x (error %s)\n", + peer_name, (int)module_proc->proc_addr_index, mca_btl_mx_component.mx_filter, mx_strerror(mx_status) ); } - module_endpoint->mx_peer.nic_id = module_proc->mx_peers[i].nic_id; - module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[i].endpoint_id; - module_endpoint->mx_peer_addr = mx_remote_addr; - module_endpoint->status = MCA_BTL_MX_CONNECTED; - module_proc->proc_addr_index = i; - break; - } - - if( i == module_proc->mx_peers_count ) { /* no available connection */ module_endpoint->status = MCA_BTL_MX_NOT_REACHEABLE; return OMPI_ERROR; } + module_endpoint->mx_peer_addr = mx_remote_addr; + module_endpoint->status = MCA_BTL_MX_CONNECTED; module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint; return OMPI_SUCCESS; diff --git a/ompi/mca/btl/mx/configure.m4 b/ompi/mca/btl/mx/configure.m4 index 12f032f47f..07c6cb4162 100644 --- a/ompi/mca/btl/mx/configure.m4 +++ b/ompi/mca/btl/mx/configure.m4 @@ -3,7 +3,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. 
All rights reserved. -# Copyright (c) 2004-2006 The University of Tennessee and The University +# Copyright (c) 2004-2007 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,