1
1
openmpi/ompi/mca/btl/tcp/btl_tcp_proc.c
Brian Barrett 8b9e8054fd Move modex from pml base to general ompi runtime, since it's used by more
than just the PML/BTLs these days.  Also clean up the code so that it
handles the situation where not all nodes register information for a given
node (rather than just spinning until that node sends information, like
we do today).

Includes r15234 and r15265 from the /tmp/bwb-modex branch.

This commit was SVN r15310.

The following SVN revisions from the original message are invalid or
inconsistent and therefore were not cross-referenced:
  r15234
  r15265
2007-07-09 17:16:34 +00:00

404 строки
14 KiB
C

/*
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#include "orte/class/orte_proc_table.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/runtime/ompi_module_exchange.h"
#include "ompi/datatype/dt_arch.h"
#include "opal/util/if.h"
#include "opal/util/net.h"
#include "orte/mca/oob/tcp/oob_tcp_addr.h"
#include "btl_tcp.h"
#include "btl_tcp_proc.h"
/* Forward declarations of the constructor/destructor hooks that are handed
 * to the class instantiation below. */
static void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc);
static void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc);
/*
 * Instantiate the class descriptor for mca_btl_tcp_proc_t. The macro is
 * given opal_list_item_t as the parent class and the two functions declared
 * above as constructor and destructor.
 */
OBJ_CLASS_INSTANCE( mca_btl_tcp_proc_t,
opal_list_item_t,
mca_btl_tcp_proc_construct,
mca_btl_tcp_proc_destruct );
/*
 * Constructor for a TCP proc instance: start with no owning ompi_proc_t,
 * no published addresses and no endpoints, and set up the per-proc lock.
 */
void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc)
{
    /* per-proc mutex guarding the endpoint array */
    OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);

    /* nothing is known about the peer yet */
    proc->proc_ompi = NULL;

    /* empty address table */
    proc->proc_addr_count = 0;
    proc->proc_addrs = NULL;

    /* empty endpoint table */
    proc->proc_endpoint_count = 0;
    proc->proc_endpoints = NULL;
}
/*
 * Destructor for a TCP proc instance.
 *
 * Removes the instance from the component-wide table of procs (under the
 * component lock), frees the endpoint array and the published-address
 * buffer, and tears down the per-proc lock.
 */
void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc)
{
    /* remove from list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
    orte_hash_table_remove_proc(&mca_btl_tcp_component.tcp_procs, &proc->proc_name);
    OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);

    /* release resources */
    if (NULL != proc->proc_endpoints) {
        free(proc->proc_endpoints);
        proc->proc_endpoints = NULL;
        proc->proc_endpoint_count = 0;
    }

    /* The address array is the buffer handed back by ompi_modex_recv() in
     * mca_btl_tcp_proc_create(); nothing else in this file releases it.
     * NOTE(review): assumes the modex buffer is owned by the caller --
     * confirm against the ompi_modex_recv() contract. */
    if (NULL != proc->proc_addrs) {
        free(proc->proc_addrs);
        proc->proc_addrs = NULL;
        proc->proc_addr_count = 0;
    }

    /* The lock is constructed unconditionally in the constructor, so it
     * must be destructed unconditionally as well -- not only when an
     * endpoint array happened to be allocated. */
    OBJ_DESTRUCT(&proc->proc_lock);
}
/*
 * Create a TCP process structure. There is a one-to-one correspondence
 * between a ompi_proc_t and a mca_btl_tcp_proc_t instance. We cache
 * additional data (specifically the list of mca_btl_tcp_endpoint_t instances,
 * and published addresses) associated w/ a given destination on this
 * datastructure.
 *
 * @param ompi_proc  process for which the TCP proc is wanted
 * @return the instance already registered for ompi_proc->proc_name, or a
 *         newly created one; NULL if allocation fails, if the modex lookup
 *         fails, or if the published data is not a whole number of
 *         mca_btl_tcp_addr_t records.
 *
 * On every failure after the instance has been created it is released
 * again, which (via the destructor) also removes it from the component's
 * proc table.
 */
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(ompi_proc_t* ompi_proc)
{
    int rc;
    size_t size;
    size_t i;
    mca_btl_tcp_proc_t* btl_proc;

    OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
    btl_proc = (mca_btl_tcp_proc_t*)orte_hash_table_get_proc(
        &mca_btl_tcp_component.tcp_procs, &ompi_proc->proc_name);
    if (NULL != btl_proc) {
        /* already known: hand back the cached instance */
        OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
        return btl_proc;
    }

    btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
    if (NULL == btl_proc) {
        /* do not leave the component lock held on the failure path */
        OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
        return NULL;
    }
    btl_proc->proc_ompi = ompi_proc;
    btl_proc->proc_name = ompi_proc->proc_name;

    /* add to hash table of all proc instance */
    orte_hash_table_set_proc( &mca_btl_tcp_component.tcp_procs,
                              &btl_proc->proc_name,
                              btl_proc );
    OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);

    /* lookup tcp parameters exported by this proc */
    rc = ompi_modex_recv( &mca_btl_tcp_component.super.btl_version,
                          ompi_proc,
                          (void**)&btl_proc->proc_addrs,
                          &size );
    if (OMPI_SUCCESS != rc) {
        BTL_ERROR(("ompi_modex_recv: failed with return value=%d", rc));
        OBJ_RELEASE(btl_proc);
        return NULL;
    }

    /* the payload must be an array of complete address records */
    if (0 != (size % sizeof(mca_btl_tcp_addr_t))) {
        /* size_t values are cast so the conversion specifier matches */
        BTL_ERROR(("ompi_modex_recv: invalid size %lu: btl-size: %lu\n",
                   (unsigned long) size,
                   (unsigned long) sizeof(mca_btl_tcp_addr_t)));
        /* release the half-built instance instead of leaking it (and
         * leaving it registered in the proc table) */
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t);

    /* allocate space for endpoint array - one for each exported address
     * (plus one spare slot) */
    btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        malloc((1 + btl_proc->proc_addr_count) *
               sizeof(mca_btl_base_endpoint_t*));
    if (NULL == btl_proc->proc_endpoints) {
        OBJ_RELEASE(btl_proc);
        return NULL;
    }

    /* remember the instance describing the local process */
    if (NULL == mca_btl_tcp_component.tcp_local && ompi_proc == ompi_proc_local()) {
        mca_btl_tcp_component.tcp_local = btl_proc;
    }

    /* convert the OMPI addr_family field to OS constants,
     * so we can check for AF_INET (or AF_INET6) and don't have
     * to deal with byte ordering anymore.
     */
    for (i = 0; i < btl_proc->proc_addr_count; i++) {
        if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) {
            btl_proc->proc_addrs[i].addr_family = AF_INET;
        }
#if OPAL_WANT_IPV6
        if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[i].addr_family) {
            btl_proc->proc_addrs[i].addr_family = AF_INET6;
        }
#endif
    }

    return btl_proc;
}
/*
 * Note that this routine must be called with the lock on the process
 * already held. Insert a btl instance into the proc array and assign
 * it an address.
 *
 * @param btl_proc      proc the endpoint belongs to
 * @param btl_endpoint  endpoint to register and to bind to a peer address
 * @return OMPI_SUCCESS when an address was bound to the endpoint,
 *         OMPI_ERR_UNREACH when none of the peer's unused addresses
 *         qualified. The endpoint stays in the proc's endpoint array in
 *         both cases.
 */
int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
                             mca_btl_base_endpoint_t* btl_endpoint )
{
    size_t i;
    struct sockaddr_storage endpoint_addr_ss;

#ifndef WORDS_BIGENDIAN
    /* if we are little endian and our peer is not so lucky, then we
       need to put all information sent to him in big endian (aka
       Network Byte Order) and expect all information received to
       be in NBO.  Since big endian machines always send and receive
       in NBO, we don't care so much about that case. */
    if (btl_proc->proc_ompi->proc_arch & OMPI_ARCH_ISBIGENDIAN) {
        btl_endpoint->endpoint_nbo = true;
    }
#endif

    /* insert into endpoint array */
    btl_endpoint->endpoint_proc = btl_proc;
    btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint;

    /*
     * Walk the peer's published addresses that are not yet in use and take
     * the first one that is (in order of the checks below) a public IPv4
     * address, an IPv6 address (when IPv6 is compiled in and not disabled),
     * or a private IPv4 address on the same network as one of our own
     * private IPv4 interfaces.
     */
    for( i = 0; i < btl_proc->proc_addr_count; i++ ) {
        mca_btl_tcp_addr_t* endpoint_addr = btl_proc->proc_addrs + i;
        int index;

        if( 0 != endpoint_addr->addr_inuse ) {
            continue;
        }

        /* An address whose family cannot be converted cannot be used;
         * the conversion routine has already reported it. Skip it rather
         * than classifying a zeroed sockaddr. */
        if( !mca_btl_tcp_proc_tosocks (endpoint_addr, &endpoint_addr_ss) ) {
            continue;
        }

        /* The best we could get is IPv4 public. So let's check */
        if (true == opal_net_addr_isipv4public((struct sockaddr*) &endpoint_addr_ss)) {
            btl_endpoint->endpoint_addr = endpoint_addr;
            btl_endpoint->endpoint_addr->addr_inuse++;
            return OMPI_SUCCESS;
        }

#if OPAL_WANT_IPV6
        /* Bug, FIXME: this is Thomas' job: if we have IPv6 AND RFC1918,
         * use IPv6, else use IPv4 private */
        /*
         * adi@2006-11-22: new bug. It's not sufficient to look for
         * remote's IPv6 capabilities, we even need to check if we're
         * able to communicate via IPv6. We might also want to look
         * at mca_btl_tcp_component.tcp_disable_family
         */
        if((AF_INET6 == endpoint_addr->addr_family) &&
           (6 != mca_btl_tcp_component.tcp_disable_family)) {
            btl_endpoint->endpoint_addr = endpoint_addr;
            btl_endpoint->endpoint_addr->addr_inuse++;
            return OMPI_SUCCESS;
        }
#endif

        /* Read:
         *   if we are on the same network, accept.
         * Bug, FIXME. May be wrong. That's only a
         * last resort
         */

        /* we're only looking for IPv4 (private) from here on; the family
         * of the candidate does not change per local interface, so decide
         * once instead of once per interface */
        if (AF_INET != endpoint_addr->addr_family) {
            continue;
        }

        /* loop over our local addresses and see if we could accept it */
        for (index = opal_ifbegin(); index >= 0; index = opal_ifnext (index)) {
            struct sockaddr_storage local_ss;
            uint32_t netmask;

            if (OPAL_SUCCESS !=
                opal_ifindextoaddr (index, (struct sockaddr*) &local_ss, sizeof (local_ss))) {
                opal_output (0,
                             "btl_tcp_proc: problems getting address for index %i (kernel index %i)\n", index, opal_ifindextokindex (index));
                continue;
            }

            if (OPAL_SUCCESS !=
                opal_ifindextomask (index, &netmask, sizeof (netmask))) {
                opal_output (0,
                             "btl_tcp_proc: problems getting netmask for index %i (kernel index %i)\n", index, opal_ifindextokindex (index));
                continue;
            }

            /* we know that we're only talking about IPv4 now.
             * Let's talk about IPv4 _private_, so isipv4public must
             * return false
             */
            if (false == opal_net_addr_isipv4public((struct sockaddr*) &local_ss)) {
                if (opal_net_samenetwork((struct sockaddr*) &local_ss,
                                         (struct sockaddr*) &endpoint_addr_ss,
                                         netmask)) {
                    btl_endpoint->endpoint_addr = endpoint_addr;
                    btl_endpoint->endpoint_addr->addr_inuse++;
                    return OMPI_SUCCESS;
                }
            }
        }
    } /* end of for btl_proc_proc_addr_count */

    /* Make sure there is a common interface. Every selection inside the
     * loop returns immediately, so at this point endpoint_addr still holds
     * whatever value the endpoint had on entry; if that is an address,
     * count it as in use. */
    if( NULL != btl_endpoint->endpoint_addr ) {
        btl_endpoint->endpoint_addr->addr_inuse++;
        return OMPI_SUCCESS;
    }

    /* Bug, FIXME: Once upon a time, there was a lot of
     * code in here. I've removed it. There might be better
     * approaches. Thomas will show...
     */
    return OMPI_ERR_UNREACH;
}
/*
 * Detach an endpoint from its proc.
 *
 * The endpoint is taken out of the proc's endpoint array. If it was the
 * last one, the proc itself is released; otherwise the address bound to
 * the endpoint (if any) is marked as one use less. An endpoint that is
 * not in the array leaves the proc untouched. Always returns OMPI_SUCCESS.
 */
int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_t* btl_endpoint)
{
    size_t slot;
    size_t trailing;

    OPAL_THREAD_LOCK(&btl_proc->proc_lock);

    /* find the position of the endpoint in the array */
    for (slot = 0; slot < btl_proc->proc_endpoint_count; ++slot) {
        if (btl_endpoint == btl_proc->proc_endpoints[slot]) {
            break;
        }
    }

    if (slot == btl_proc->proc_endpoint_count) {
        /* not registered with this proc: nothing to do */
        OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
        return OMPI_SUCCESS;
    }

    /* close the gap by shifting the remaining entries down by one */
    trailing = btl_proc->proc_endpoint_count - slot - 1;
    memmove(&btl_proc->proc_endpoints[slot],
            &btl_proc->proc_endpoints[slot + 1],
            trailing * sizeof(mca_btl_base_endpoint_t*));
    btl_proc->proc_endpoint_count -= 1;

    if (0 == btl_proc->proc_endpoint_count) {
        /* last endpoint gone: drop our reference on the proc; the lock
         * must be given up first because the proc may go away */
        OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
        OBJ_RELEASE(btl_proc);
        return OMPI_SUCCESS;
    }

    /* An endpoint removed early during wireup (for instance because no
       other proc can reach it) may never have been given an address,
       so endpoint_addr can legitimately be NULL here. */
    if (NULL != btl_endpoint->endpoint_addr) {
        btl_endpoint->endpoint_addr->addr_inuse -= 1;
    }

    OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
    return OMPI_SUCCESS;
}
/*
 * Find the TCP proc instance registered under the given process name.
 *
 * The component's proc table is queried while holding the component lock;
 * the result of that query is returned unchanged (mca_btl_tcp_proc_create()
 * treats a NULL result of the same query as "not registered").
 */
mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const orte_process_name_t *name)
{
    mca_btl_tcp_proc_t* found;

    OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
    found = (mca_btl_tcp_proc_t*)
        orte_hash_table_get_proc(&mca_btl_tcp_component.tcp_procs, name);
    OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);

    return found;
}
/*
 * Loop through all endpoints of the proc looking for one whose assigned
 * address matches the source address of the incoming connection request,
 * and let that endpoint try to accept the connection.
 *
 * @param btl_proc  proc the connection request claims to come from
 * @param addr      source address of the incoming connection
 * @param sd        socket descriptor of the incoming connection
 * @return true as soon as one endpoint accepts the connection, false if
 *         no endpoint matches or every matching endpoint declines.
 */
bool mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr, int sd)
{
    size_t i;

    OPAL_THREAD_LOCK(&btl_proc->proc_lock);
    for( i = 0; i < btl_proc->proc_endpoint_count; i++ ) {
        mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];

        /* An endpoint is placed in the array before an address is bound to
         * it, and stays there without one if no address qualified; such an
         * endpoint cannot match any source address. */
        if( NULL == btl_endpoint->endpoint_addr ) {
            continue;
        }

        /* Check all conditions before going to try to accept the connection. */
        if( btl_endpoint->endpoint_addr->addr_family != addr->sa_family ) {
            continue;
        }

        switch (addr->sa_family) {
        case AF_INET:
            if( memcmp( &btl_endpoint->endpoint_addr->addr_inet,
                        &(((struct sockaddr_in*)addr)->sin_addr),
                        sizeof(struct in_addr) ) ) {
                continue;
            }
            break;
#if OPAL_WANT_IPV6
        case AF_INET6:
            if( memcmp( &btl_endpoint->endpoint_addr->addr_inet,
                        &(((struct sockaddr_in6*)addr)->sin6_addr),
                        sizeof(struct in6_addr) ) ) {
                continue;
            }
            break;
#endif
        default:
            /* other families: only the family itself is compared */
            ;
        }

        if(mca_btl_tcp_endpoint_accept(btl_endpoint, addr, sd)) {
            OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
            return true;
        }
    }
    OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
    return false;
}
/*
 * Fill a sockaddr_storage from the BTL's own address record
 * (mca_btl_tcp_addr_t).
 *
 * The output is zeroed first; family, address and port are then copied for
 * AF_INET (and for AF_INET6 when IPv6 support is compiled in). Returns true
 * on success. Any other family is reported through opal_output() and
 * yields false, leaving the output zeroed.
 */
bool mca_btl_tcp_proc_tosocks(mca_btl_tcp_addr_t* proc_addr,
                              struct sockaddr_storage* output)
{
    memset(output, 0, sizeof (*output));

    if (AF_INET == proc_addr->addr_family) {
        struct sockaddr_in* in4 = (struct sockaddr_in*)output;

        output->ss_family = AF_INET;
        memcpy(&in4->sin_addr, &proc_addr->addr_inet, sizeof(struct in_addr));
        in4->sin_port = proc_addr->addr_port;
        return true;
    }

#if OPAL_WANT_IPV6
    if (AF_INET6 == proc_addr->addr_family) {
        struct sockaddr_in6* in6 = (struct sockaddr_in6*)output;

        output->ss_family = AF_INET6;
        memcpy(&in6->sin6_addr, &proc_addr->addr_inet,
               sizeof (proc_addr->addr_inet));
        in6->sin6_port = proc_addr->addr_port;
        /* no scope or flow information is carried in the address record */
        in6->sin6_scope_id = 0;
        in6->sin6_flowinfo = 0;
        return true;
    }
#endif

    /* neither of the families handled above */
    opal_output( 0, "mca_btl_tcp_proc: unknown af_family received: %d\n",
                 proc_addr->addr_family );
    return false;
}