1
1

Ensure an orted exits with non-zero status if it is unable to send a message. Add more diagnostic messages to the OOB set_addr code.

cmr=v1.7.5:reviewer=jsquyres

This commit was SVN r30701.
Этот коммит содержится в:
Ralph Castain 2014-02-12 19:44:01 +00:00
родитель d0e8aeaee4
Коммит a8a9801a0b
3 изменённых файла: 71 добавление и 16 удалений

Просмотреть файл

@ -251,6 +251,8 @@ static void proc_errors(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
"%s errmgr:orted lifeline lost - exiting",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* set our exit status */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* terminate - our routed children will see

Просмотреть файл

@ -325,6 +325,10 @@ static void process_uri(char *uri)
*/
if (peer.jobid == ORTE_PROC_MY_NAME->jobid &&
peer.vpid == ORTE_PROC_MY_NAME->vpid) {
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s:set_addr peer %s is me",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer));
return;
}
@ -340,7 +344,7 @@ static void process_uri(char *uri)
if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
ORTE_ERROR_LOG(rc);
opal_argv_free(uris);
return;
return;
}
}
@ -352,6 +356,10 @@ static void process_uri(char *uri)
rc = ORTE_ERR_UNREACH;
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
component = (mca_oob_base_component_t*)cli->cli_component;
opal_output_verbose(5, orte_oob_base_framework.framework_output,
"%s:set_addr checking if peer %s is reachable via component %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
if (NULL != component->set_addr) {
if (ORTE_SUCCESS == component->set_addr(&peer, uris)) {
/* this component found reachable addresses

Просмотреть файл

@ -561,6 +561,10 @@ static bool component_available(void)
*/
if (add_this_nic) {
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:tcp:init creating module for %s address on interface %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(AF_INET == my_ss.ss_family) ? "V4" : "V6", name);
/* we want to support this interface, so create a module for it */
if (ORTE_SUCCESS != (rc = mca_oob_tcp_create(kindex, name))) {
ORTE_ERROR_LOG(rc);
@ -570,13 +574,27 @@ static bool component_available(void)
/* add this address to our connections */
if (AF_INET == my_ss.ss_family) {
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:tcp:init adding %s to our list of %s connections",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
opal_net_get_hostname((struct sockaddr*) &my_ss),
(AF_INET == my_ss.ss_family) ? "V4" : "V6");
opal_argv_append_nosize(&mca_oob_tcp_component.ipv4conns, opal_net_get_hostname((struct sockaddr*) &my_ss));
} else if (AF_INET6 == my_ss.ss_family) {
#if OPAL_ENABLE_IPV6
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:tcp:init adding %s to our list of %s connections",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
opal_net_get_hostname((struct sockaddr*) &my_ss),
(AF_INET == my_ss.ss_family) ? "V4" : "V6");
opal_argv_append_nosize(&mca_oob_tcp_component.ipv6conns, opal_net_get_hostname((struct sockaddr*) &my_ss));
#endif
} else {
opal_output_verbose(10, orte_oob_base_framework.framework_output,
"%s oob:tcp:init ignoring %s from out list of connections",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
opal_net_get_hostname((struct sockaddr*) &my_ss));
}
}
/* cleanup */
@ -776,7 +794,7 @@ static int component_set_addr(orte_process_name_t *peer,
{
char **addrs, *hptr;
char *tcpuri=NULL, *host, *ports;
int i, j, k, rc;
int i, j, k, n, rc;
mca_oob_tcp_module_t *mod;
mca_oob_tcp_component_peer_t *pr;
uint16_t af_family = AF_UNSPEC;
@ -849,23 +867,50 @@ static int component_set_addr(orte_process_name_t *peer,
/* cycle across the provided addrs */
for (j=0; NULL != addrs[j]; j++) {
/* lookup the kernel index of this address */
if (0 >= (k = opal_ifaddrtokindex(addrs[j]))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s UNFOUND KERNEL INDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
/* we don't have an interface on this subnet - ignore it */
continue;
}
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s NO MODULE AT KINDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
continue;
/* if they gave us "localhost", then just take our lowest kernel index interface */
if (0 == strcasecmp(addrs[j], "localhost")) {
n = opal_ifbegin();
mod = NULL;
while (0 <= n) {
k = opal_ifindextokindex(n);
if (NULL != (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s USING MODULE AT KINDEX %d FOR LOCALHOST",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k);
break;
}
n = opal_ifnext(n);
}
if (NULL == mod) {
/* should never happen */
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s NO MODULE FOR LOCALHOST",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
continue;
}
} else {
/* lookup the kernel index of this address */
if (0 >= (k = opal_ifaddrtokindex(addrs[j]))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s UNFOUND KERNEL INDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
/* we don't have an interface on this subnet - ignore it */
continue;
}
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s NO MODULE AT KINDEX %d FOR ADDRESS %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
continue;
}
}
/* record that this peer may be reachable via this module, but don't assign
* the peer to this module until later when we actually connect
*/
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
"%s PEER %s MAY BE REACHABLE USING MODULE AT KINDEX %d INTERFACE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(peer), k, mod->if_name);
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers,
ui64, (void**)&pr) || NULL == pr) {
pr = OBJ_NEW(mca_oob_tcp_component_peer_t);