Provide a better error message when the OOB cannot send a message after exhausting its retries, and then have the process abort so the job doesn't just hang forever.
Since it could be a daemon that needs to abort, clean up the abort sequence so the daemon can exit as cleanly as possible. This commit was SVN r21361.
Этот коммит содержится в:
родитель
882b40182b
Коммит
3815bfbba6
@ -343,7 +343,13 @@ int orte_ess_base_orted_finalize(void)
|
|||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
|
|
||||||
/* ensure all the orteds depart together */
|
/* ensure all the orteds depart together */
|
||||||
orte_grpcomm.onesided_barrier();
|
if (!orte_abnormal_term_ordered) {
|
||||||
|
/* if we are abnormally terminating, don't attempt
|
||||||
|
* to do a barrier as nobody else will be entering
|
||||||
|
* that call
|
||||||
|
*/
|
||||||
|
orte_grpcomm.onesided_barrier();
|
||||||
|
}
|
||||||
|
|
||||||
orte_notifier_base_close();
|
orte_notifier_base_close();
|
||||||
|
|
||||||
|
@ -55,12 +55,14 @@
|
|||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/net.h"
|
#include "opal/util/net.h"
|
||||||
#include "opal/util/error.h"
|
#include "opal/util/error.h"
|
||||||
|
|
||||||
#include "opal/class/opal_hash_table.h"
|
#include "opal/class/opal_hash_table.h"
|
||||||
|
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/routed/routed.h"
|
#include "orte/mca/routed/routed.h"
|
||||||
|
#include "orte/mca/ess/ess.h"
|
||||||
|
#include "orte/runtime/orte_wait.h"
|
||||||
|
|
||||||
#include "oob_tcp.h"
|
#include "oob_tcp.h"
|
||||||
#include "oob_tcp_peer.h"
|
#include "oob_tcp_peer.h"
|
||||||
@ -612,10 +614,13 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
|||||||
/* giving up and cleanup any pending messages */
|
/* giving up and cleanup any pending messages */
|
||||||
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
|
if(peer->peer_retries++ > mca_oob_tcp_component.tcp_peer_retries) {
|
||||||
mca_oob_tcp_msg_t *msg;
|
mca_oob_tcp_msg_t *msg;
|
||||||
|
char *host;
|
||||||
|
|
||||||
opal_output(0, "%s-%s oob-tcp: Communication retries exceeded. Can not communicate with peer",
|
host = orte_ess.proc_get_hostname(&(peer->peer_name));
|
||||||
|
opal_output(0, "%s -> %s (node: %s) oob-tcp: Communication retries exceeded. Can not communicate with peer",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&(peer->peer_name)));
|
ORTE_NAME_PRINT(&(peer->peer_name)),
|
||||||
|
(NULL == host) ? "NULL" : host);
|
||||||
|
|
||||||
/* There are cases during the initial connection setup where
|
/* There are cases during the initial connection setup where
|
||||||
the peer_send_msg is NULL but there are things in the queue
|
the peer_send_msg is NULL but there are things in the queue
|
||||||
@ -637,6 +642,18 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer)
|
|||||||
not likely to suddenly become successful, so abort the
|
not likely to suddenly become successful, so abort the
|
||||||
whole thing */
|
whole thing */
|
||||||
peer->peer_state = MCA_OOB_TCP_FAILED;
|
peer->peer_state = MCA_OOB_TCP_FAILED;
|
||||||
|
|
||||||
|
/* since we cannot communicate, and the system obviously needed
|
||||||
|
* to do so, let's abort so we don't just hang here
|
||||||
|
*/
|
||||||
|
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
|
||||||
|
/* just wake us up */
|
||||||
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
|
orte_abnormal_term_ordered = true;
|
||||||
|
orte_trigger_event(&orte_exit);
|
||||||
|
} else {
|
||||||
|
orte_errmgr.abort(1, NULL);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (peer->peer_sd >= 0) {
|
if (peer->peer_sd >= 0) {
|
||||||
|
@ -715,6 +715,7 @@ static void shutdown_signal(int fd, short flags, void *arg)
|
|||||||
* against race conditions - the trigger event will
|
* against race conditions - the trigger event will
|
||||||
* check the one-time lock
|
* check the one-time lock
|
||||||
*/
|
*/
|
||||||
|
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||||
orte_trigger_event(&orte_exit);
|
orte_trigger_event(&orte_exit);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -775,7 +776,7 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
|||||||
|
|
||||||
/* Finalize and clean up ourselves */
|
/* Finalize and clean up ourselves */
|
||||||
ret = orte_finalize();
|
ret = orte_finalize();
|
||||||
exit(ret);
|
exit(orte_exit_status);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void signal_callback(int fd, short event, void *arg)
|
static void signal_callback(int fd, short event, void *arg)
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user