diff --git a/opal/mca/btl/tcp/btl_tcp.h b/opal/mca/btl/tcp/btl_tcp.h index 846ee3b7ca..a3ca945e9c 100644 --- a/opal/mca/btl/tcp/btl_tcp.h +++ b/opal/mca/btl/tcp/btl_tcp.h @@ -164,13 +164,11 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component; struct mca_btl_tcp_module_t { mca_btl_base_module_t super; /**< base BTL interface */ uint16_t tcp_ifkindex; /** */ -#else - /* Bug, FIXME: needs testing */ - struct my_in6_addr { - union { - uint32_t u6_addr32[4]; - struct _my_in6_addr { - struct in_addr _addr_inet; - uint32_t _pad[3]; - } _addr__inet; - } _union_inet; - } addr_inet; + struct in6_addr addr_inet6; /* IPv6 listen address */ #endif - in_port_t addr_port; /**< listen port */ - uint16_t addr_ifkindex; /**< remote interface index assigned with + }; + in_port_t addr_port; /**< listen port */ + int addr_ifkindex; /**< remote interface index assigned with this address */ - unsigned short addr_inuse; /**< local meaning only */ - uint8_t addr_family; /**< AF_INET or AF_INET6 */ + uint8_t addr_family; /**< AF_INET or AF_INET6 */ + bool addr_inuse; /**< local meaning only */ }; typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t; #define MCA_BTL_TCP_AF_INET 0 -#if OPAL_ENABLE_IPV6 -# define MCA_BTL_TCP_AF_INET6 1 -#endif +#define MCA_BTL_TCP_AF_INET6 1 #endif diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 85a472b82b..422db48a16 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -490,11 +490,59 @@ static int mca_btl_tcp_component_close(void) * Create a btl instance and add to modules list. */ -static int mca_btl_tcp_create(int if_kindex, const char* if_name) +static int mca_btl_tcp_create(const int if_kindex, const char* if_name) { struct mca_btl_tcp_module_t* btl; char param[256]; int i; + struct sockaddr_storage addr; + bool found = false; + + /* + * Look for an address on the given device (ie, kindex) which + * isn't disabled by the disable_family option. If there isn't + * one, skip creating the modules for this interface. We store + * the address on the module both to publish in the modex and to + * use as the source address of all packets sent by this module. + * + * This still isn't quite right. Better would be to pull apart + * split_and_resolve and pass the address used to select the + * device into mca_btl_tcp_create(). This is a cleanup of the + * logic that's been in use for years, but the case it doesn't + * cover is (say) only specifing mca_btl_if_include 10.0.0.0/16 + * when the interface has addresses of both 10.0.0.1 and 10.1.0.1; + * there's absolutely nothing that keeps this code from picking + * 10.1.0.1 as the one that is published in the modex and used for + * connection. + */ + for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) { + int ret; + + if (if_kindex != opal_ifindextokindex(i)) { + continue; + } + + ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr, + sizeof(struct sockaddr_storage)); + if (OPAL_SUCCESS != ret) { + return ret; + } + + if (addr.ss_family == AF_INET && + 4 != mca_btl_tcp_component.tcp_disable_family) { + found = true; + break; + } else if (addr.ss_family == AF_INET6 && + 6 != mca_btl_tcp_component.tcp_disable_family) { + found = true; + break; + } + } + /* if we didn't find an address that works for us on this + interface, just move on. */ + if (!found) { + return OPAL_SUCCESS; + } for( i = 0; i < (int)mca_btl_tcp_component.tcp_num_links; i++ ) { btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t)); @@ -513,17 +561,8 @@ static int mca_btl_tcp_create(int if_kindex, const char* if_name) btl->tcp_send_handler = 0; #endif - struct sockaddr_storage addr; - opal_ifkindextoaddr(if_kindex, (struct sockaddr*) &addr, - sizeof (struct sockaddr_storage)); -#if OPAL_ENABLE_IPV6 - if (addr.ss_family == AF_INET6) { - btl->tcp_ifaddr_6 = addr; - } -#endif - if (addr.ss_family == AF_INET) { - btl->tcp_ifaddr = addr; - } + memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage)); + /* allow user to specify interface bandwidth */ sprintf(param, "bandwidth_%s", if_name); mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_bandwidth, OPAL_INFO_LVL_5, &btl->super.btl_bandwidth); @@ -567,8 +606,8 @@ static int mca_btl_tcp_create(int if_kindex, const char* if_name) opal_output_verbose(5, opal_btl_base_framework.framework_output, "btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n", (void*)btl, if_name, (int) btl->tcp_ifkindex, i, - opal_net_get_hostname((struct sockaddr*)&addr), - (addr.ss_family == AF_INET) ? "IPv4" : "IPv6", + opal_net_get_hostname((struct sockaddr*)&btl->tcp_ifaddr), + (btl->tcp_ifaddr.ss_family == AF_INET) ? "IPv4" : "IPv6", btl->super.btl_bandwidth, btl->super.btl_latency); } return OPAL_SUCCESS; @@ -721,6 +760,7 @@ static int mca_btl_tcp_component_create_instances(void) */ for(if_index = opal_ifbegin(); if_index >= 0; if_index = opal_ifnext(if_index)){ int index = opal_ifindextokindex (if_index); + if (index > 0) { bool want_this_if = true; @@ -1121,97 +1161,64 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family) static int mca_btl_tcp_component_exchange(void) { - int rc = 0, index; - size_t i = 0; - size_t size = mca_btl_tcp_component.tcp_addr_count * - mca_btl_tcp_component.tcp_num_links * sizeof(mca_btl_tcp_addr_t); - /* adi@2007-04-12: - * - * We'll need to explain things a bit here: - * 1. We normally have as many BTLs as physical NICs. - * 2. With num_links, we now have num_btl = num_links * #NICs - * 3. we might have more than one address per NIC - */ - size_t xfer_size = 0; /* real size to transfer (may differ from 'size') */ - size_t current_addr = 0; + int rc; + size_t i; + size_t num_btls = mca_btl_tcp_component.tcp_num_btls; + size_t size = num_btls * sizeof(mca_btl_tcp_modex_addr_t); + mca_btl_tcp_modex_addr_t *addrs; - if(mca_btl_tcp_component.tcp_num_btls != 0) { - char ifn[32]; - mca_btl_tcp_addr_t *addrs = (mca_btl_tcp_addr_t *)malloc(size); - memset(addrs, 0, size); + if (num_btls <= 0) { + return 0; + } - /* here we start populating our addresses */ - for( i = 0; i < mca_btl_tcp_component.tcp_num_btls; i++ ) { - for (index = opal_ifbegin(); index >= 0; - index = opal_ifnext(index)) { - struct sockaddr_storage my_ss; + addrs = (mca_btl_tcp_modex_addr_t*)malloc(size); + if (NULL == addrs) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + memset(addrs, 0, size); - /* Look if the module's address belongs to this - * kernel IP interface. If not, go to next address. - */ - if (opal_ifindextokindex (index) != - mca_btl_tcp_component.tcp_btls[i]->tcp_ifkindex) { - continue; - } - - opal_ifindextoname(index, ifn, sizeof(ifn)); - opal_output_verbose(30, opal_btl_base_framework.framework_output, - "btl: tcp: component_exchange: examining interface %s", - ifn); - if (OPAL_SUCCESS != - opal_ifindextoaddr(index, (struct sockaddr*) &my_ss, - sizeof (my_ss))) { - opal_output (0, - "btl_tcp_component: problems getting address for index %i (kernel index %i)\n", - index, opal_ifindextokindex (index)); - continue; - } + for (i = 0 ; i < num_btls ; i++) { + struct mca_btl_tcp_module_t *btl = mca_btl_tcp_component.tcp_btls[i]; + struct sockaddr *addr = (struct sockaddr*)&(btl->tcp_ifaddr); #if OPAL_ENABLE_IPV6 - if ((AF_INET6 == my_ss.ss_family) && - (6 != mca_btl_tcp_component.tcp_disable_family)) { - memcpy(&addrs[current_addr].addr_inet, - &((struct sockaddr_in6*)&my_ss)->sin6_addr, - sizeof(addrs[0].addr_inet)); - addrs[current_addr].addr_port = - mca_btl_tcp_component.tcp6_listen_port; - addrs[current_addr].addr_family = MCA_BTL_TCP_AF_INET6; - xfer_size += sizeof (mca_btl_tcp_addr_t); - addrs[current_addr].addr_inuse = 0; - addrs[current_addr].addr_ifkindex = - opal_ifindextokindex (index); - current_addr++; - opal_output_verbose(30, opal_btl_base_framework.framework_output, - "btl: tcp: component_exchange: " - "%s IPv6 %s", ifn, - opal_net_get_hostname((struct sockaddr*)&my_ss)); - } else + if (AF_INET6 == addr->sa_family) { + struct sockaddr_in6 *inaddr6 = (struct sockaddr_in6*)addr; + + memcpy(&addrs[i].addr, &(inaddr6->sin6_addr), + sizeof(struct in6_addr)); + addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port; + addrs[i].addr_ifkindex = btl->tcp_ifkindex; + addrs[i].addr_family = MCA_BTL_TCP_AF_INET6; + opal_output_verbose(5, opal_btl_base_framework.framework_output, + "btl: tcp: exchange: %d %d IPv6 %s", + (int)i, btl->tcp_ifkindex, + opal_net_get_hostname(addr)); + } else #endif - if ((AF_INET == my_ss.ss_family) && - (4 != mca_btl_tcp_component.tcp_disable_family)) { - memcpy(&addrs[current_addr].addr_inet, - &((struct sockaddr_in*)&my_ss)->sin_addr, - sizeof(struct in_addr)); - addrs[current_addr].addr_port = - mca_btl_tcp_component.tcp_listen_port; - addrs[current_addr].addr_family = MCA_BTL_TCP_AF_INET; - xfer_size += sizeof (mca_btl_tcp_addr_t); - addrs[current_addr].addr_inuse = 0; - addrs[current_addr].addr_ifkindex = - opal_ifindextokindex (index); - current_addr++; - opal_output_verbose(30, opal_btl_base_framework.framework_output, - "btl: tcp: component_exchange: " - "%s IPv4 %s", ifn, - opal_net_get_hostname((struct sockaddr*)&my_ss)); - } - } /* end of for opal_ifbegin() */ - } /* end of for tcp_num_btls */ - OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, - &mca_btl_tcp_component.super.btl_version, - addrs, xfer_size); - free(addrs); - } /* end if */ + if (AF_INET == addr->sa_family) { + struct sockaddr_in *inaddr = (struct sockaddr_in*)addr; + + memcpy(&addrs[i].addr, &(inaddr->sin_addr), + sizeof(struct in_addr)); + addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port; + addrs[i].addr_ifkindex = btl->tcp_ifkindex; + addrs[i].addr_family = MCA_BTL_TCP_AF_INET; + opal_output_verbose(5, opal_btl_base_framework.framework_output, + "btl: tcp: exchange: %d %d IPv4 %s", + (int)i, btl->tcp_ifkindex, + opal_net_get_hostname(addr)); + } else { + BTL_ERROR(("Unexpected address family: %d", addr->sa_family)); + return OPAL_ERR_BAD_PARAM; + } + } + + OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, + &mca_btl_tcp_component.super.btl_version, + addrs, size); + free(addrs); + return rc; } diff --git a/opal/mca/btl/tcp/btl_tcp_endpoint.c b/opal/mca/btl/tcp/btl_tcp_endpoint.c index 1ead5f8fb7..0cc8537aab 100644 --- a/opal/mca/btl/tcp/btl_tcp_endpoint.c +++ b/opal/mca/btl/tcp/btl_tcp_endpoint.c @@ -743,11 +743,11 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo #if OPAL_ENABLE_IPV6 if (endpoint_addr.ss_family == AF_INET6) { assert(NULL != &btl_endpoint->endpoint_btl->tcp_ifaddr_6); - if (bind(btl_endpoint->endpoint_sd, (struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr_6, + if (bind(btl_endpoint->endpoint_sd, (struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr, sizeof(struct sockaddr_in6)) < 0) { BTL_ERROR(("bind on local address (%s:%d) failed: %s (%d)", opal_net_get_hostname((struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr), - htons(((struct sockaddr_in*)&btl_endpoint->endpoint_btl->tcp_ifaddr)->sin_port), + htons(((struct sockaddr_in6*)&btl_endpoint->endpoint_btl->tcp_ifaddr)->sin6_port), strerror(opal_socket_errno), opal_socket_errno)); CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd); diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index a1bec00b82..e84a26cbf5 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -117,73 +117,99 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc) mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) { mca_btl_tcp_proc_t* btl_proc; - size_t size; int rc; + mca_btl_tcp_modex_addr_t *remote_addrs = NULL; + size_t i, size; OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs, proc->proc_name, (void**)&btl_proc); - if(OPAL_SUCCESS == rc) { + if (OPAL_SUCCESS == rc) { OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock); return btl_proc; } - do { /* This loop is only necessary so that we can break out of the serial code */ - btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); - if(NULL == btl_proc) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - break; + /* proc was not found, so create one */ + btl_proc = OBJ_NEW(mca_btl_tcp_proc_t); + if (NULL == btl_proc) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + + /* Retain the proc, but don't store the ref into the btl_proc just yet. This + * provides a way to release the btl_proc in case of failure without having to + * unlock the mutex. + */ + OBJ_RETAIN(proc); + + /* lookup tcp parameters exported by this proc */ + OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version, + &proc->proc_name, (uint8_t**)&remote_addrs, &size); + if (OPAL_SUCCESS != rc) { + if (OPAL_ERR_NOT_FOUND != rc) { + BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc)); } + goto cleanup; + } - /* Retain the proc, but don't store the ref into the btl_proc just yet. This - * provides a way to release the btl_proc in case of failure without having to - * unlock the mutex. - */ - OBJ_RETAIN(proc); + if (0 != (size % sizeof(mca_btl_tcp_modex_addr_t))) { + BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n", + (unsigned long)size, + (unsigned long)sizeof(mca_btl_tcp_modex_addr_t))); + rc = OPAL_ERROR; + goto cleanup; + } - /* lookup tcp parameters exported by this proc */ - OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version, - &proc->proc_name, (uint8_t**)&btl_proc->proc_addrs, &size); - if(rc != OPAL_SUCCESS) { - if(OPAL_ERR_NOT_FOUND != rc) - BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc)); - break; - } + btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_modex_addr_t); + btl_proc->proc_addrs = malloc(btl_proc->proc_addr_count * + sizeof(mca_btl_tcp_addr_t)); + if (NULL == btl_proc->proc_addrs) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto cleanup; + } - if(0 != (size % sizeof(mca_btl_tcp_addr_t))) { - BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n", - (unsigned long) size, (unsigned long)sizeof(mca_btl_tcp_addr_t))); - rc = OPAL_ERROR; - break; - } - - btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t); - - /* allocate space for endpoint array - one for each exported address */ - btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**) - malloc((1 + btl_proc->proc_addr_count) * - sizeof(mca_btl_base_endpoint_t*)); - if(NULL == btl_proc->proc_endpoints) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - break; - } - - /* convert the OPAL addr_family field to OS constants, - * so we can check for AF_INET (or AF_INET6) and don't have - * to deal with byte ordering anymore. - */ - for (unsigned int i = 0; i < btl_proc->proc_addr_count; i++) { - if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) { - btl_proc->proc_addrs[i].addr_family = AF_INET; - } + /* the modex and proc structures differ slightly, so copy the + fields needed in the proc version */ + for (i = 0 ; i < btl_proc->proc_addr_count ; i++) { + if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) { + memcpy(&btl_proc->proc_addrs[i].addr_inet, + remote_addrs[i].addr, sizeof(struct in_addr)); + btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port; + btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex; + btl_proc->proc_addrs[i].addr_family = AF_INET; + btl_proc->proc_addrs[i].addr_inuse = false; + } else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) { #if OPAL_ENABLE_IPV6 - if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[i].addr_family) { - btl_proc->proc_addrs[i].addr_family = AF_INET6; - } + memcpy(&btl_proc->proc_addrs[i].addr_inet6, + remote_addrs[i].addr, sizeof(struct in6_addr)); + btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port; + btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex; + btl_proc->proc_addrs[i].addr_family = AF_INET6; + btl_proc->proc_addrs[i].addr_inuse = false; +#else + rc = OPAL_ERR_NOT_SUPPORTED; + goto cleanup; #endif + } else { + BTL_ERROR(("Unexpected address family %d", + (int)remote_addrs[i].addr_family)); + rc = OPAL_ERR_BAD_PARAM; + goto cleanup; } - } while (0); + } + free(remote_addrs); + + /* allocate space for endpoint array - one for each exported address */ + btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**) + malloc((1 + btl_proc->proc_addr_count) * + sizeof(mca_btl_base_endpoint_t*)); + if (NULL == btl_proc->proc_endpoints) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto cleanup; + } + +cleanup: if (OPAL_SUCCESS == rc) { btl_proc->proc_opal = proc; /* link with the proc */ /* add to hash table of all proc instance. */ @@ -669,7 +695,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc, } peer_interfaces[best]->inuse++; btl_endpoint->endpoint_addr = proc_data->best_addr[i][best]; - btl_endpoint->endpoint_addr->addr_inuse++; + btl_endpoint->endpoint_addr->addr_inuse = true; rc = OPAL_SUCCESS; break; } @@ -695,7 +721,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc, if (CQ_NO_CONNECTION != max) { peer_interfaces[j_max]->inuse++; btl_endpoint->endpoint_addr = proc_data->best_addr[i_max][j_max]; - btl_endpoint->endpoint_addr->addr_inuse++; + btl_endpoint->endpoint_addr->addr_inuse = true; rc = OPAL_SUCCESS; } } @@ -772,7 +798,7 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_ being removed early in the wireup sequence (e.g., if it is unreachable by all other procs) */ if (NULL != btl_endpoint->endpoint_addr) { - btl_endpoint->endpoint_addr->addr_inuse--; + btl_endpoint->endpoint_addr->addr_inuse = false; } break; }