Try again to get an error message printed when a daemon fails to successfully report back to mpirun. In this case, there is no guaranteed way for the daemon to output the error report itself - we don't have a connection back to the HNP, and we have tied stderr off to /dev/null (for good reasons). So the HNP has to detect the failure itself and report it.
The HNP can't know the precise reason, of course - all it knows is that the daemon failed. So output a generic error message that provides guidance on probable causes. Refs trac:4571. This commit was SVN r31589. The following Trac tickets were found above: Ticket 4571 --> https://svn.open-mpi.org/trac/ompi/ticket/4571
Этот коммит содержится в:
родитель
0fac9781b3
Коммит
445b552d3a
@ -10,6 +10,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -27,3 +28,34 @@ address is unknown:
|
|||||||
On node: %s
|
On node: %s
|
||||||
|
|
||||||
The message could not be delivered, and we are aborting.
|
The message could not be delivered, and we are aborting.
|
||||||
|
#
|
||||||
|
[failed-daemon-launch]
|
||||||
|
ORTE was unable to reliably start one or more daemons.
|
||||||
|
This usually is caused by:
|
||||||
|
|
||||||
|
* not finding the required libraries and/or binaries on
|
||||||
|
one or more nodes. Please check your PATH and LD_LIBRARY_PATH
|
||||||
|
settings, or configure OMPI with --enable-orterun-prefix-by-default
|
||||||
|
|
||||||
|
* lack of authority to execute on one or more specified nodes.
|
||||||
|
Please verify your allocation and authorities.
|
||||||
|
|
||||||
|
* the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).
|
||||||
|
Please check with your sys admin to determine the correct location to use.
|
||||||
|
|
||||||
|
* compilation of the orted with dynamic libraries when static are required
|
||||||
|
(e.g., on Cray). Please check your configure cmd line and consider using
|
||||||
|
one of the contrib/platform definitions for your system type.
|
||||||
|
|
||||||
|
* an inability to create a connection back to mpirun due to a
|
||||||
|
lack of common network interfaces and/or no route found between
|
||||||
|
them. Please check network connectivity (including firewalls
|
||||||
|
and network routing requirements).
|
||||||
|
#
|
||||||
|
[failed-daemon]
|
||||||
|
An ORTE daemon has unexpectedly failed after launch and before
|
||||||
|
communicating back to mpirun. This could be caused by a number
|
||||||
|
of factors, including an inability to create a connection back
|
||||||
|
to mpirun due to a lack of common network interfaces and/or no
|
||||||
|
route found between them. Please check network connectivity
|
||||||
|
(including firewalls and network routing requirements).
|
||||||
|
@ -208,8 +208,25 @@ static void job_errors(int fd, short args, void *cbdata)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* if this is the daemon job, then we need to ensure we
|
||||||
|
* output an error message indicating we couldn't launch the
|
||||||
|
* daemons */
|
||||||
|
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||||
|
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* if the daemon job aborted and we haven't heard from everyone yet,
|
||||||
|
* then this could well have been caused by a daemon not finding
|
||||||
|
* a way back to us. In this case, output a message indicating a daemon
|
||||||
|
* died without reporting. Otherwise, say nothing as we
|
||||||
|
* likely already output an error message */
|
||||||
|
if (ORTE_JOB_STATE_ABORTED == jobstate &&
|
||||||
|
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||||
|
jdata->num_procs != jdata->num_reported) {
|
||||||
|
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
|
||||||
|
}
|
||||||
|
|
||||||
/* abort the job */
|
/* abort the job */
|
||||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
|
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
|
||||||
/* set the global abnormal exit flag */
|
/* set the global abnormal exit flag */
|
||||||
@ -451,6 +468,11 @@ static void proc_errors(int fd, short args, void *cbdata)
|
|||||||
jdata->abort = true;
|
jdata->abort = true;
|
||||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||||
}
|
}
|
||||||
|
/* if this was a daemon, report it */
|
||||||
|
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||||
|
/* output a message indicating we failed to launch a daemon */
|
||||||
|
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||||
|
}
|
||||||
/* abnormal termination - abort */
|
/* abnormal termination - abort */
|
||||||
default_hnp_abort(jdata);
|
default_hnp_abort(jdata);
|
||||||
break;
|
break;
|
||||||
|
@ -64,15 +64,4 @@ value will be ignored.
|
|||||||
Local host: %s
|
Local host: %s
|
||||||
Value: %s
|
Value: %s
|
||||||
Message: %s
|
Message: %s
|
||||||
#
|
|
||||||
[unable-to-communicate]
|
|
||||||
One or more TCP routes were given to a process, but no
|
|
||||||
communication path could be found:
|
|
||||||
|
|
||||||
Node: %s
|
|
||||||
Source: %s
|
|
||||||
Peer: %s
|
|
||||||
|
|
||||||
This usually is caused by a lack of common network interfaces
|
|
||||||
and no route found between them.
|
|
||||||
|
|
@ -1487,15 +1487,6 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&pop->peer));
|
ORTE_NAME_PRINT(&pop->peer));
|
||||||
|
|
||||||
/* eventually, we should allow other OOB components a chance to connect
|
|
||||||
* to the target process. However, for now, we need to ensure we don't
|
|
||||||
* have a silent failure - so emit a "couldn't connect" message
|
|
||||||
*/
|
|
||||||
orte_show_help("help-oob-tcp.txt", "unable-to-communicate", true,
|
|
||||||
orte_process_info.nodename,
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
|
||||||
ORTE_NAME_PRINT(&pop->peer));
|
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
/* if this was a lifeline, then alert */
|
/* if this was a lifeline, then alert */
|
||||||
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
|
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
|
||||||
|
@ -503,14 +503,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
|||||||
|
|
||||||
if (0 != status) {
|
if (0 != status) {
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
/* we have a problem during launch */
|
|
||||||
opal_output(0, "ERROR: alps failed to start the required daemons.");
|
|
||||||
opal_output(0, "ERROR: This could be due to an inability to find the orted binary (--prefix)");
|
|
||||||
opal_output(0, "ERROR: on one or more remote nodes, compilation of the orted with dynamic libraries,");
|
|
||||||
opal_output(0, "ERROR: lack of authority to execute on one or more specified nodes,");
|
|
||||||
opal_output(0, "ERROR: or the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).");
|
|
||||||
|
|
||||||
/* report that the daemon has failed so we break out of the daemon
|
|
||||||
* callback receive and exit
|
* callback receive and exit
|
||||||
*/
|
*/
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
|
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
# Additional copyrights may follow
|
# Additional copyrights may follow
|
||||||
@ -48,4 +49,3 @@ are running.
|
|||||||
|
|
||||||
Please consult with your system administrator about obtaining
|
Please consult with your system administrator about obtaining
|
||||||
such support.
|
such support.
|
||||||
|
|
||||||
|
@ -537,6 +537,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
|||||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||||
"%s plm:slurm: daemon failed during launch",
|
"%s plm:slurm: daemon failed during launch",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
|
/* notify the error manager */
|
||||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||||
} else {
|
} else {
|
||||||
/* if this is after launch, then we need to abort only if the status
|
/* if this is after launch, then we need to abort only if the status
|
||||||
|
@ -424,7 +424,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -471,7 +471,7 @@ static void poll_spawns(int fd, short args, void *cbdata)
|
|||||||
|
|
||||||
/* check for failed launch - if so, force terminate */
|
/* check for failed launch - if so, force terminate */
|
||||||
if (failed_launch) {
|
if (failed_launch) {
|
||||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user