Provide a better error message when the OOB cannot send a message after exhausting its retries, and then have the process abort so the job doesn't just hang forever.

Since it could be a daemon that needs to abort, clean up the abort sequence so the daemon can exit as cleanly as possible. This commit was SVN r21361.
2009-06-02 23:57:12 +00:00 · 2009-06-02 23:57:12 +00:00 · 3815bfbba6
--- a/orte/mca/ess/base/ess_base_std_orted.c
+++ b/orte/mca/ess/base/ess_base_std_orted.c
@ -343,7 +343,13 @@ int orte_ess_base_orted_finalize(void)
    opal_list_item_t *item;
    
    /* ensure all the orteds depart together */
+    if (!orte_abnormal_term_ordered) {
+        /* if we are abnormally terminating, don't attempt
+         * to do a barrier as nobody else will be entering
+         * that call
+         */
        orte_grpcomm.onesided_barrier();
+    }
    
    orte_notifier_base_close();
    
--- a/orte/mca/oob/tcp/oob_tcp_peer.c
+++ b/orte/mca/oob/tcp/oob_tcp_peer.c
@ -55,12 +55,14 @@
 #include "opal/util/output.h"
 #include "opal/util/net.h"
 #include "opal/util/error.h"
-
 #include "opal/class/opal_hash_table.h"
+
 #include "orte/util/name_fns.h"
 #include "orte/runtime/orte_globals.h"
 #include "orte/mca/errmgr/errmgr.h"
 #include "orte/mca/routed/routed.h"
+#include "orte/mca/ess/ess.h"
+#include "orte/runtime/orte_wait.h"

 #include "oob_tcp.h"
 #include "oob_tcp_peer.h"
@ -612,10 +614,13 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
    /* giving up and cleanup any pending messages */
    if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
        mca_oob_tcp_msg_t *msg;
+        char *host;

-        opal_output(0, "%s-%s oob-tcp: Communication retries exceeded.  Can not communicate with peer",
+        host = orte_ess.proc_get_hostname(&(peer->peer_name));
+        opal_output(0, "%s -> %s (node: %s) oob-tcp: Communication retries exceeded.  Can not communicate with peer",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                    ORTE_NAME_PRINT(&(peer->peer_name)));
+                    ORTE_NAME_PRINT(&(peer->peer_name)),
+                    (NULL == host) ? "NULL" : host);

        /* There are cases during the initial connection setup where
           the peer_send_msg is NULL but there are things in the queue
@ -637,6 +642,18 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
           not likely to suddenly become successful, so abort the
           whole thing */
        peer->peer_state = MCA_OOB_TCP_FAILED;
+        
+        /* since we cannot communicate, and the system obviously needed
+         * to do so, let's abort so we don't just hang here
+         */
+        if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
+            /* just wake us up */
+            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
+            orte_abnormal_term_ordered = true;
+            orte_trigger_event(&orte_exit);
+        } else {
+            orte_errmgr.abort(1, NULL);
+        }
    }

    if (peer->peer_sd >= 0) {
--- a/orte/orted/orted_main.c
+++ b/orte/orted/orted_main.c
@ -715,6 +715,7 @@ static void shutdown_signal(int fd, short flags, void *arg)
     * against race conditions - the trigger event will
     * check the one-time lock
     */
+    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
    orte_trigger_event(&orte_exit);
 }

@ -775,7 +776,7 @@ static void shutdown_callback(int fd, short flags, void *arg)

    /* Finalize and clean up ourselves */
    ret = orte_finalize();
-    exit(ret);
+    exit(orte_exit_status);
 }

 static void signal_callback(int fd, short event, void *arg)