1
1

Provide a better error message when the OOB cannot send a message after exhausting its retries, and then have the process abort so the job doesn't just hang forever.

Since it could be a daemon that needs to abort, clean up the abort sequence so the daemon can exit as cleanly as possible.

This commit was SVN r21361.
Этот коммит содержится в:
Ralph Castain 2009-06-02 23:57:12 +00:00
родитель 882b40182b
Коммит 3815bfbba6
3 изменённых файлов: 29 добавлений и 5 удалений

Просмотреть файл

@ -343,7 +343,13 @@ int orte_ess_base_orted_finalize(void)
opal_list_item_t *item; opal_list_item_t *item;
/* ensure all the orteds depart together */ /* ensure all the orteds depart together */
orte_grpcomm.onesided_barrier(); if (!orte_abnormal_term_ordered) {
/* if we are abnormally terminating, don't attempt
* to do a barrier as nobody else will be entering
* that call
*/
orte_grpcomm.onesided_barrier();
}
orte_notifier_base_close(); orte_notifier_base_close();

Просмотреть файл

@ -55,12 +55,14 @@
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/net.h" #include "opal/util/net.h"
#include "opal/util/error.h" #include "opal/util/error.h"
#include "opal/class/opal_hash_table.h" #include "opal/class/opal_hash_table.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/routed/routed.h" #include "orte/mca/routed/routed.h"
#include "orte/mca/ess/ess.h"
#include "orte/runtime/orte_wait.h"
#include "oob_tcp.h" #include "oob_tcp.h"
#include "oob_tcp_peer.h" #include "oob_tcp_peer.h"
@ -612,10 +614,13 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
/* giving up and cleanup any pending messages */ /* giving up and cleanup any pending messages */
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) { if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
mca_oob_tcp_msg_t *msg; mca_oob_tcp_msg_t *msg;
char *host;
opal_output(0, "%s-%s oob-tcp: Communication retries exceeded. Can not communicate with peer", host = orte_ess.proc_get_hostname(&(peer->peer_name));
opal_output(0, "%s -> %s (node: %s) oob-tcp: Communication retries exceeded. Can not communicate with peer",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&(peer->peer_name))); ORTE_NAME_PRINT(&(peer->peer_name)),
(NULL == host) ? "NULL" : host);
/* There are cases during the initial connection setup where /* There are cases during the initial connection setup where
the peer_send_msg is NULL but there are things in the queue the peer_send_msg is NULL but there are things in the queue
@ -637,6 +642,18 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
not likely to suddenly become successful, so abort the not likely to suddenly become successful, so abort the
whole thing */ whole thing */
peer->peer_state = MCA_OOB_TCP_FAILED; peer->peer_state = MCA_OOB_TCP_FAILED;
/* since we cannot communicate, and the system obviously needed
* to do so, let's abort so we don't just hang here
*/
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
/* just wake us up */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_abnormal_term_ordered = true;
orte_trigger_event(&orte_exit);
} else {
orte_errmgr.abort(1, NULL);
}
} }
if (peer->peer_sd >= 0) { if (peer->peer_sd >= 0) {

Просмотреть файл

@ -715,6 +715,7 @@ static void shutdown_signal(int fd, short flags, void *arg)
* against race conditions - the trigger event will * against race conditions - the trigger event will
* check the one-time lock * check the one-time lock
*/ */
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
orte_trigger_event(&orte_exit); orte_trigger_event(&orte_exit);
} }
@ -775,7 +776,7 @@ static void shutdown_callback(int fd, short flags, void *arg)
/* Finalize and clean up ourselves */ /* Finalize and clean up ourselves */
ret = orte_finalize(); ret = orte_finalize();
exit(ret); exit(orte_exit_status);
} }
static void signal_callback(int fd, short event, void *arg) static void signal_callback(int fd, short event, void *arg)