Ensure an orted exits with non-zero status if it is unable to send a message. Add more diagnostic messages to the OOB set_addr code
cmr=v1.7.5:reviewer=jsquyres This commit was SVN r30701.
Этот коммит содержится в:
родитель
d0e8aeaee4
Коммит
a8a9801a0b
@ -251,6 +251,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:orted lifeline lost - exiting",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* set our exit status */
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
/* kill our children */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* terminate - our routed children will see
|
||||
|
@ -325,6 +325,10 @@ static void process_uri(char *uri)
|
||||
*/
|
||||
if (peer.jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
peer.vpid == ORTE_PROC_MY_NAME->vpid) {
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
"%s:set_addr peer %s is me",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer));
|
||||
return;
|
||||
}
|
||||
|
||||
@ -340,7 +344,7 @@ static void process_uri(char *uri)
|
||||
if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, (void*)pr))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_argv_free(uris);
|
||||
return;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@ -352,6 +356,10 @@ static void process_uri(char *uri)
|
||||
rc = ORTE_ERR_UNREACH;
|
||||
OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) {
|
||||
component = (mca_oob_base_component_t*)cli->cli_component;
|
||||
opal_output_verbose(5, orte_oob_base_framework.framework_output,
|
||||
"%s:set_addr checking if peer %s is reachable via component %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&peer), component->oob_base.mca_component_name);
|
||||
if (NULL != component->set_addr) {
|
||||
if (ORTE_SUCCESS == component->set_addr(&peer, uris)) {
|
||||
/* this component found reachable addresses
|
||||
|
@ -561,6 +561,10 @@ static bool component_available(void)
|
||||
*/
|
||||
|
||||
if (add_this_nic) {
|
||||
opal_output_verbose(10, orte_oob_base_framework.framework_output,
|
||||
"%s oob:tcp:init creating module for %s address on interface %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(AF_INET == my_ss.ss_family) ? "V4" : "V6", name);
|
||||
/* we want to support this interface, so create a module for it */
|
||||
if (ORTE_SUCCESS != (rc = mca_oob_tcp_create(kindex, name))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -570,13 +574,27 @@ static bool component_available(void)
|
||||
|
||||
/* add this address to our connections */
|
||||
if (AF_INET == my_ss.ss_family) {
|
||||
opal_output_verbose(10, orte_oob_base_framework.framework_output,
|
||||
"%s oob:tcp:init adding %s to our list of %s connections",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
opal_net_get_hostname((struct sockaddr*) &my_ss),
|
||||
(AF_INET == my_ss.ss_family) ? "V4" : "V6");
|
||||
opal_argv_append_nosize(&mca_oob_tcp_component.ipv4conns, opal_net_get_hostname((struct sockaddr*) &my_ss));
|
||||
} else if (AF_INET6 == my_ss.ss_family) {
|
||||
#if OPAL_ENABLE_IPV6
|
||||
opal_output_verbose(10, orte_oob_base_framework.framework_output,
|
||||
"%s oob:tcp:init adding %s to our list of %s connections",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
opal_net_get_hostname((struct sockaddr*) &my_ss),
|
||||
(AF_INET == my_ss.ss_family) ? "V4" : "V6");
|
||||
opal_argv_append_nosize(&mca_oob_tcp_component.ipv6conns, opal_net_get_hostname((struct sockaddr*) &my_ss));
|
||||
#endif
|
||||
} else {
|
||||
opal_output_verbose(10, orte_oob_base_framework.framework_output,
|
||||
"%s oob:tcp:init ignoring %s from out list of connections",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
opal_net_get_hostname((struct sockaddr*) &my_ss));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
@ -776,7 +794,7 @@ static int component_set_addr(orte_process_name_t *peer,
|
||||
{
|
||||
char **addrs, *hptr;
|
||||
char *tcpuri=NULL, *host, *ports;
|
||||
int i, j, k, rc;
|
||||
int i, j, k, n, rc;
|
||||
mca_oob_tcp_module_t *mod;
|
||||
mca_oob_tcp_component_peer_t *pr;
|
||||
uint16_t af_family = AF_UNSPEC;
|
||||
@ -849,23 +867,50 @@ static int component_set_addr(orte_process_name_t *peer,
|
||||
|
||||
/* cycle across the provided addrs */
|
||||
for (j=0; NULL != addrs[j]; j++) {
|
||||
/* lookup the kernel index of this address */
|
||||
if (0 >= (k = opal_ifaddrtokindex(addrs[j]))) {
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s UNFOUND KERNEL INDEX %d FOR ADDRESS %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
|
||||
/* we don't have an interface on this subnet - ignore it */
|
||||
continue;
|
||||
}
|
||||
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s NO MODULE AT KINDEX %d FOR ADDRESS %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
|
||||
continue;
|
||||
/* if they gave us "localhost", then just take our lowest kernel index interface */
|
||||
if (0 == strcasecmp(addrs[j], "localhost")) {
|
||||
n = opal_ifbegin();
|
||||
mod = NULL;
|
||||
while (0 <= n) {
|
||||
k = opal_ifindextokindex(n);
|
||||
if (NULL != (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s USING MODULE AT KINDEX %d FOR LOCALHOST",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k);
|
||||
break;
|
||||
}
|
||||
n = opal_ifnext(n);
|
||||
}
|
||||
if (NULL == mod) {
|
||||
/* should never happen */
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s NO MODULE FOR LOCALHOST",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
/* lookup the kernel index of this address */
|
||||
if (0 >= (k = opal_ifaddrtokindex(addrs[j]))) {
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s UNFOUND KERNEL INDEX %d FOR ADDRESS %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
|
||||
/* we don't have an interface on this subnet - ignore it */
|
||||
continue;
|
||||
}
|
||||
if (NULL == (mod = (mca_oob_tcp_module_t*)opal_pointer_array_get_item(&mca_oob_tcp_component.modules, k))) {
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s NO MODULE AT KINDEX %d FOR ADDRESS %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), k, addrs[j]);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
/* record that this peer may be reachable via this module, but don't assign
|
||||
* the peer to this module until later when we actually connect
|
||||
*/
|
||||
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
|
||||
"%s PEER %s MAY BE REACHABLE USING MODULE AT KINDEX %d INTERFACE %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(peer), k, mod->if_name);
|
||||
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_component.peers,
|
||||
ui64, (void**)&pr) || NULL == pr) {
|
||||
pr = OBJ_NEW(mca_oob_tcp_component_peer_t);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user