From 3815bfbba60e1a9e634631eafcab3cba4ca26163 Mon Sep 17 00:00:00 2001
From: Ralph Castain <rhc@open-mpi.org>
Date: Tue, 2 Jun 2009 23:57:12 +0000
Subject: [PATCH] Provide a better error message when the oob cannot send a
 message after exhausting retries, and then have the proc abort so the job
 doesn't just hang forever.

Since it could be a daemon that needs to abort, clean up the abort sequence so the daemon can exit as cleanly as possible.

This commit was SVN r21361.
---
 orte/mca/ess/base/ess_base_std_orted.c |  8 +++++++-
 orte/mca/oob/tcp/oob_tcp_peer.c        | 23 ++++++++++++++++++++---
 orte/orted/orted_main.c                |  3 ++-
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c
index bd96f68a77..d3b0055469 100644
--- a/orte/mca/ess/base/ess_base_std_orted.c
+++ b/orte/mca/ess/base/ess_base_std_orted.c
@@ -343,7 +343,13 @@ int orte_ess_base_orted_finalize(void)
     opal_list_item_t *item;
     
     /* ensure all the orteds depart together */
-    orte_grpcomm.onesided_barrier();
+    if (!orte_abnormal_term_ordered) {
+        /* only do the barrier on a normal exit - if we are
+         * abnormally terminating, nobody else will be
+         * entering that call
+         */
+        orte_grpcomm.onesided_barrier();
+    }
     
     orte_notifier_base_close();
     
diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c
index 71ffba6586..d137b59afe 100644
--- a/orte/mca/oob/tcp/oob_tcp_peer.c
+++ b/orte/mca/oob/tcp/oob_tcp_peer.c
@@ -55,12 +55,14 @@
 #include "opal/util/output.h"
 #include "opal/util/net.h"
 #include "opal/util/error.h"
-
 #include "opal/class/opal_hash_table.h"
+
 #include "orte/util/name_fns.h"
 #include "orte/runtime/orte_globals.h"
 #include "orte/mca/errmgr/errmgr.h"
 #include "orte/mca/routed/routed.h"
+#include "orte/mca/ess/ess.h"
+#include "orte/runtime/orte_wait.h"
 
 #include "oob_tcp.h"
 #include "oob_tcp_peer.h"
@@ -612,10 +614,13 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
     /* giving up and cleanup any pending messages */
     if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
         mca_oob_tcp_msg_t *msg;
+        char *host;
 
-        opal_output(0, "%s-%s oob-tcp: Communication retries exceeded.  Can not communicate with peer",
+        host = orte_ess.proc_get_hostname(&(peer->peer_name));
+        opal_output(0, "%s -> %s (node: %s) oob-tcp: Communication retries exceeded.  Can not communicate with peer",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                    ORTE_NAME_PRINT(&(peer->peer_name)));
+                    ORTE_NAME_PRINT(&(peer->peer_name)),
+                    (NULL == host) ? "NULL" : host);
 
         /* There are cases during the initial connection setup where
            the peer_send_msg is NULL but there are things in the queue
@@ -637,6 +642,18 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
            not likely to suddenly become successful, so abort the
            whole thing */
         peer->peer_state = MCA_OOB_TCP_FAILED;
+        
+        /* since we cannot communicate, and the system obviously needed
+         * to do so, let's abort so we don't just hang here
+         */
+        if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
+            /* just wake us up */
+            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
+            orte_abnormal_term_ordered = true;
+            orte_trigger_event(&orte_exit);
+        } else {
+            orte_errmgr.abort(1, NULL);
+        }
     }
 
     if (peer->peer_sd >= 0) {
diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c
index b6b82bb634..c9cf44d84b 100644
--- a/orte/orted/orted_main.c
+++ b/orte/orted/orted_main.c
@@ -715,6 +715,7 @@ static void shutdown_signal(int fd, short flags, void *arg)
      * against race conditions - the trigger event will
      * check the one-time lock
      */
+    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
     orte_trigger_event(&orte_exit);
 }
 
@@ -775,7 +776,7 @@ static void shutdown_callback(int fd, short flags, void *arg)
 
     /* Finalize and clean up ourselves */
     ret = orte_finalize();
-    exit(ret);
+    exit(orte_exit_status);
 }
 
 static void signal_callback(int fd, short event, void *arg)