From 3cee4152fc17ae0f65ae9337a968c37e903ebf5c Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 11 May 2015 09:16:25 -0700 Subject: [PATCH] Fix the intercommunicator issue reported by Gilles. Instead of directly checking the reachability bitmap, ask the component if the proc is reachable when doing a send as the component is the final arbiter in such cases. Recirculate any messages that a daemon is trying to send to avoid race conditions. Clean up listener sockets so we don't leak them --- orte/mca/oob/base/oob_base_stubs.c | 11 +++++++++-- orte/mca/oob/tcp/oob_tcp_listener.c | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/orte/mca/oob/base/oob_base_stubs.c b/orte/mca/oob/base/oob_base_stubs.c index 926d723695..df4a69ba0d 100644 --- a/orte/mca/oob/base/oob_base_stubs.c +++ b/orte/mca/oob/base/oob_base_stubs.c @@ -120,6 +120,13 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata) } /* if nobody could reach it, then that's an error */ if (!reachable) { + /* if we are a daemon or HNP, then it could be that + * this is a local proc we just haven't heard from + * yet due to a race condition. Check that situation */ + if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) { + ORTE_OOB_SEND(msg); + return; + } msg->status = ORTE_ERR_ADDRESSEE_UNKNOWN; ORTE_RML_SEND_COMPLETE(msg); return; @@ -150,8 +157,8 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata) msg_sent = false; OPAL_LIST_FOREACH(cli, &orte_oob_base.actives, mca_base_component_list_item_t) { component = (mca_oob_base_component_t*)cli->cli_component; - /* is this peer addressable by this component? */ - if (!opal_bitmap_is_set_bit(&pr->addressable, component->idx)) { + /* is this peer reachable via this component? 
*/ + if (!component->is_reachable(&msg->dst)) { continue; } /* it is addressable, so attempt to send via that transport */ diff --git a/orte/mca/oob/tcp/oob_tcp_listener.c b/orte/mca/oob/tcp/oob_tcp_listener.c index 5e6bb8e300..de95488c32 100644 --- a/orte/mca/oob/tcp/oob_tcp_listener.c +++ b/orte/mca/oob/tcp/oob_tcp_listener.c @@ -876,6 +876,7 @@ static void connection_event_handler(int incoming_sd, short flags, void* cbdata) static void tcp_ev_cons(mca_oob_tcp_listener_t* event) { event->ev_active = false; + event->sd = -1; } static void tcp_ev_des(mca_oob_tcp_listener_t* event) { @@ -883,6 +884,10 @@ static void tcp_ev_des(mca_oob_tcp_listener_t* event) opal_event_del(&event->event); } event->ev_active = false; + if (0 <= event->sd) { + CLOSE_THE_SOCKET(event->sd); + event->sd = -1; + } } OBJ_CLASS_INSTANCE(mca_oob_tcp_listener_t,