1
1

Derived from a patch provided by Artem: clean up the "abnormal" code path for selecting TCP OOB modules to connect to a remote process. If we can't find a direct interface-to-address match, then assign all the provided addresses to the first available TCP module and let the normal failure process determine whether the remote proc is truly reachable.

cmr=v1.8.2:reviewer=artpol:subject=fix abnormal code connection path in tcp oob

This commit was SVN r31536.
Этот коммит содержится в:
Ralph Castain 2014-04-28 19:05:14 +00:00
родитель 626b521e9c
Коммит d642babff6

Просмотреть файл

@ -805,7 +805,7 @@ static int component_set_addr(orte_process_name_t *peer,
char **addrs, *hptr; char **addrs, *hptr;
char *tcpuri=NULL, *host, *ports; char *tcpuri=NULL, *host, *ports;
int i, j, k, rc; int i, j, k, rc;
mca_oob_tcp_module_t *mod; mca_oob_tcp_module_t *mod, *firstmod;
mca_oob_tcp_component_peer_t *pr; mca_oob_tcp_component_peer_t *pr;
uint16_t af_family = AF_UNSPEC; uint16_t af_family = AF_UNSPEC;
uint64_t ui64; uint64_t ui64;
@ -950,10 +950,32 @@ static int component_set_addr(orte_process_name_t *peer,
/* if we cycled thru all their addresses without finding a match in our /* if we cycled thru all their addresses without finding a match in our
* interfaces, then it remains possible that they have a routed system * interfaces, then it remains possible that they have a routed system
* that can still route messages to the destination. So give it a chance * that can still route messages to the destination. So give it a chance
* to succeed by assigning our first module to try, and let the normal * to succeed by assigning each provided address to the first module in our list,
* failure progression ultimately determine if we can reach this peer * and let the normal failure progression ultimately determine if we
* can reach this peer. A future enhancement could be to do a better
* job of matching provided addresses with available interfaces, perhaps
* looking at the number of matching octets and assigning the address
* to the interface with the most matches - but that's for someone else
* to address :-)
*/ */
if (!assigned) { if (!assigned) {
firstmod = NULL;
for (k=0; k < mca_oob_tcp_component.modules.size; k++) {
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
continue;
}
if (NULL == firstmod) {
firstmod = mod;
break;
}
}
if (NULL == firstmod) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
for (j=0; NULL != addrs[j]; j++) {
/* if they gave us "localhost", then just take the first conn on our list */
if (0 == strcasecmp(addrs[j], "localhost")) {
#if OPAL_ENABLE_IPV6 #if OPAL_ENABLE_IPV6
if (AF_INET6 == af_family) { if (AF_INET6 == af_family) {
if (NULL == mca_oob_tcp_component.ipv6conns || if (NULL == mca_oob_tcp_component.ipv6conns ||
@ -971,19 +993,8 @@ static int component_set_addr(orte_process_name_t *peer,
#if OPAL_ENABLE_IPV6 #if OPAL_ENABLE_IPV6
} }
#endif #endif
/* lookup the kernel index of this address */ } else {
if (0 >= (k = opal_ifaddrtokindex(host))) { host = addrs[j];
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s UNFOUND KERNEL INDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, host);
/* we don't have an interface on this subnet - ignore it */
continue;
}
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s NO MODULE AT KINDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, host);
continue;
} }
/* record that this peer may be reachable via this module, but don't assign /* record that this peer may be reachable via this module, but don't assign
* the peer to this module until later when we actually connect * the peer to this module until later when we actually connect
@ -991,7 +1002,7 @@ static int component_set_addr(orte_process_name_t *peer,
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s PEER %s MAY BE REACHABLE BY ROUTING - ASSIGNING MODULE AT KINDEX %d INTERFACE %s", "%s PEER %s MAY BE REACHABLE BY ROUTING - ASSIGNING MODULE AT KINDEX %d INTERFACE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer), k, mod->if_name); ORTE_NAME_PRINT(peer), k, firstmod->if_name);
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers, if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers,
ui64, (void**)&pr) || NULL == pr) { ui64, (void**)&pr) || NULL == pr) {
pr = OBJ_NEW(mca_oob_tcp_component_peer_t); pr = OBJ_NEW(mca_oob_tcp_component_peer_t);
@ -1012,11 +1023,12 @@ static int component_set_addr(orte_process_name_t *peer,
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s PASSING ADDR %s TO INTERFACE %s AT KERNEL INDEX %d", "%s PASSING ADDR %s TO INTERFACE %s AT KERNEL INDEX %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), host, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), host,
mod->if_name, k); firstmod->if_name, k);
mod->api.set_peer((struct mca_oob_tcp_module_t*)mod, mod->api.set_peer((struct mca_oob_tcp_module_t*)firstmod,
peer, af_family, host, ports); peer, af_family, host, ports);
found = true; found = true;
} }
}
if (NULL != addrs) { if (NULL != addrs) {
opal_argv_free(addrs); opal_argv_free(addrs);
} }