orte/errmgr: Improve help message on connection lost
Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
Этот коммит содержится в:
родитель
578d8819cf
Коммит
c452f68495
@@ -11,6 +11,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
+# Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@@ -61,9 +62,10 @@ route found between them. Please check network connectivity
|
||||
(including firewalls and network routing requirements).
|
||||
#
|
||||
[node-died]
|
||||
-ORTE has lost communication with its daemon located on node:
|
||||
+ORTE has lost communication with a remote daemon.
|
||||
|
||||
-hostname: %s
|
||||
+HNP daemon : %s on node %s
|
||||
+Remote daemon: %s on node %s
|
||||
|
||||
This is usually due to either a failure of the TCP network
|
||||
connection to the node, or possibly an internal failure of
|
||||
|
@@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
+* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@@ -379,7 +380,11 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
/* record the first one to fail */
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* output an error message so the user knows what happened */
|
||||
-orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
|
||||
+orte_show_help("help-errmgr-base.txt", "node-died", true,
|
||||
+ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
+orte_process_info.nodename,
|
||||
+ORTE_NAME_PRINT(proc),
|
||||
+pptr->node->name);
|
||||
/* mark the daemon job as failed */
|
||||
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
|
@@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
+* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@@ -381,7 +382,11 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
/* record the first one to fail */
|
||||
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
|
||||
/* output an error message so the user knows what happened */
|
||||
-orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
|
||||
+orte_show_help("help-errmgr-base.txt", "node-died", true,
|
||||
+ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
+orte_process_info.nodename,
|
||||
+ORTE_NAME_PRINT(proc),
|
||||
+pptr->node->name);
|
||||
/* mark the daemon job as failed */
|
||||
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user