From 3815bfbba60e1a9e634631eafcab3cba4ca26163 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 2 Jun 2009 23:57:12 +0000 Subject: [PATCH] Provide a better error message when the oob cannot send a message after exhausting retries, and then have the proc abort so the job doesn't just hang forever. Since it could be a daemon that needs to abort, clean up the abort sequence so the daemon can exit as cleanly as possible. This commit was SVN r21361. --- orte/mca/ess/base/ess_base_std_orted.c | 8 +++++++- orte/mca/oob/tcp/oob_tcp_peer.c | 23 ++++++++++++++++++++--- orte/orted/orted_main.c | 3 ++- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index bd96f68a77..d3b0055469 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -343,7 +343,13 @@ int orte_ess_base_orted_finalize(void) opal_list_item_t *item; /* ensure all the orteds depart together */ - orte_grpcomm.onesided_barrier(); + if (!orte_abnormal_term_ordered) { + /* if we are abnormally terminating, don't attempt + * to do a barrier as nobody else will be entering + * that call + */ + orte_grpcomm.onesided_barrier(); + } orte_notifier_base_close(); diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 71ffba6586..d137b59afe 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -55,12 +55,14 @@ #include "opal/util/output.h" #include "opal/util/net.h" #include "opal/util/error.h" - #include "opal/class/opal_hash_table.h" + #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/routed/routed.h" +#include "orte/mca/ess/ess.h" +#include "orte/runtime/orte_wait.h" #include "oob_tcp.h" #include "oob_tcp_peer.h" @@ -612,10 +614,13 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) /* giving up and cleanup any pending messages */ 
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) { mca_oob_tcp_msg_t *msg; + char *host; - opal_output(0, "%s-%s oob-tcp: Communication retries exceeded. Can not communicate with peer", + host = orte_ess.proc_get_hostname(&(peer->peer_name)); + opal_output(0, "%s -> %s (node: %s) oob-tcp: Communication retries exceeded. Can not communicate with peer", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&(peer->peer_name))); + ORTE_NAME_PRINT(&(peer->peer_name)), + (NULL == host) ? "NULL" : host); /* There are cases during the initial connection setup where the peer_send_msg is NULL but there are things in the queue @@ -637,6 +642,18 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) not likely to suddenly become successful, so abort the whole thing */ peer->peer_state = MCA_OOB_TCP_FAILED; + + /* since we cannot communicate, and the system obviously needed + * to do so, let's abort so we don't just hang here + */ + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { + /* just wake us up */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + orte_abnormal_term_ordered = true; + orte_trigger_event(&orte_exit); + } else { + orte_errmgr.abort(1, NULL); + } } if (peer->peer_sd >= 0) { diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index b6b82bb634..c9cf44d84b 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -715,6 +715,7 @@ static void shutdown_signal(int fd, short flags, void *arg) * against race conditions - the trigger event will * check the one-time lock */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); orte_trigger_event(&orte_exit); } @@ -775,7 +776,7 @@ static void shutdown_callback(int fd, short flags, void *arg) /* Finalize and clean up ourselves */ ret = orte_finalize(); - exit(ret); + exit(orte_exit_status); } static void signal_callback(int fd, short event, void *arg)