1
1

Derived from a patch provided by Artem: clean up the "abnormal" code path for selecting TCP OOB modules to connect to a remote process. If we can't find a direct interface-to-address match, then assign all the provided addresses to the first available TCP module and let the normal failure process determine if the remote proc is truly reachable.

cmr=v1.8.2:reviewer=artpol:subject=fix abnormal code connection path in tcp oob

This commit was SVN r31536.
Этот коммит содержится в:
Ralph Castain 2014-04-28 19:05:14 +00:00
родитель 626b521e9c
Коммит d642babff6

Просмотреть файл

@ -805,7 +805,7 @@ static int component_set_addr(orte_process_name_t *peer,
char **addrs, *hptr;
char *tcpuri=NULL, *host, *ports;
int i, j, k, rc;
mca_oob_tcp_module_t *mod;
mca_oob_tcp_module_t *mod, *firstmod;
mca_oob_tcp_component_peer_t *pr;
uint16_t af_family = AF_UNSPEC;
uint64_t ui64;
@ -950,10 +950,32 @@ static int component_set_addr(orte_process_name_t *peer,
/* if we cycled thru all their addresses without finding a match in our
* interfaces, then it remains possible that they have a routed system
* that can still route messages to the destination. So give it a chance
* to succeed by assigning our first module to try, and let the normal
* failure progression ultimately determine if we can reach this peer
* to succeed by assigning each provided address to the first module in our list,
* and let the normal failure progression ultimately determine if we
* can reach this peer. A future enhancement could be to do a better
* job of matching provided addresses with available interfaces, perhaps
* looking at the number of matching octets and assigning the address
* to the interface with the most matches - but that's for someone else
* to address :-)
*/
if (!assigned) {
firstmod = NULL;
for (k=0; k < mca_oob_tcp_component.modules.size; k++) {
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
continue;
}
if (NULL == firstmod) {
firstmod = mod;
break;
}
}
if (NULL == firstmod) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
for (j=0; NULL != addrs[j]; j++) {
/* if they gave us "localhost", then just take the first conn on our list */
if (0 == strcasecmp(addrs[j], "localhost")) {
#if OPAL_ENABLE_IPV6
if (AF_INET6 == af_family) {
if (NULL == mca_oob_tcp_component.ipv6conns ||
@ -971,19 +993,8 @@ static int component_set_addr(orte_process_name_t *peer,
#if OPAL_ENABLE_IPV6
}
#endif
/* lookup the kernel index of this address */
if (0 >= (k = opal_ifaddrtokindex(host))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s UNFOUND KERNEL INDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, host);
/* we don't have an interface on this subnet - ignore it */
continue;
}
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s NO MODULE AT KINDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, host);
continue;
} else {
host = addrs[j];
}
/* record that this peer may be reachable via this module, but don't assign
* the peer to this module until later when we actually connect
@ -991,7 +1002,7 @@ static int component_set_addr(orte_process_name_t *peer,
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s PEER %s MAY BE REACHABLE BY ROUTING - ASSIGNING MODULE AT KINDEX %d INTERFACE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer), k, mod->if_name);
ORTE_NAME_PRINT(peer), k, firstmod->if_name);
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers,
ui64, (void**)&pr) || NULL == pr) {
pr = OBJ_NEW(mca_oob_tcp_component_peer_t);
@ -1012,11 +1023,12 @@ static int component_set_addr(orte_process_name_t *peer,
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s PASSING ADDR %s TO INTERFACE %s AT KERNEL INDEX %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), host,
mod->if_name, k);
mod->api.set_peer((struct mca_oob_tcp_module_t*)mod,
firstmod->if_name, k);
mod->api.set_peer((struct mca_oob_tcp_module_t*)firstmod,
peer, af_family, host, ports);
found = true;
}
}
if (NULL != addrs) {
opal_argv_free(addrs);
}