Try again to get an error message printed when a daemon fails to report back to mpirun. In this case, there is no guaranteed way for the daemon to output the error report itself: we don't have a connection back to the HNP, and we have tied stderr off to /dev/null (for good reasons). So the HNP has to detect the failure itself and report it.
The HNP can't know the precise reason, of course - all it knows is that the daemon failed. So output a generic error message that provides guidance on probable causes. Refs trac:4571. This commit was SVN r31589. The following Trac tickets were found above: Ticket 4571 --> https://svn.open-mpi.org/trac/ompi/ticket/4571
Этот коммит содержится в:
родитель
0fac9781b3
Коммит
445b552d3a
@ -10,6 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -27,3 +28,34 @@ address is unknown:
|
||||
On node: %s
|
||||
|
||||
The message could not be delivered, and we are aborting.
|
||||
#
|
||||
[failed-daemon-launch]
|
||||
ORTE was unable to reliably start one or more daemons.
|
||||
This usually is caused by:
|
||||
|
||||
* not finding the required libraries and/or binaries on
|
||||
one or more nodes. Please check your PATH and LD_LIBRARY_PATH
|
||||
settings, or configure OMPI with --enable-orterun-prefix-by-default
|
||||
|
||||
* lack of authority to execute on one or more specified nodes.
|
||||
Please verify your allocation and authorities.
|
||||
|
||||
* the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).
|
||||
Please check with your sys admin to determine the correct location to use.
|
||||
|
||||
* compilation of the orted with dynamic libraries when static are required
|
||||
(e.g., on Cray). Please check your configure cmd line and consider using
|
||||
one of the contrib/platform definitions for your system type.
|
||||
|
||||
* an inability to create a connection back to mpirun due to a
|
||||
lack of common network interfaces and/or no route found between
|
||||
them. Please check network connectivity (including firewalls
|
||||
and network routing requirements).
|
||||
#
|
||||
[failed-daemon]
|
||||
An ORTE daemon has unexpectedly failed after launch and before
|
||||
communicating back to mpirun. This could be caused by a number
|
||||
of factors, including an inability to create a connection back
|
||||
to mpirun due to a lack of common network interfaces and/or no
|
||||
route found between them. Please check network connectivity
|
||||
(including firewalls and network routing requirements).
|
||||
|
@ -208,8 +208,25 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if this is the daemon job, then we need to ensure we
|
||||
* output an error message indicating we couldn't launch the
|
||||
* daemons */
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||
}
|
||||
}
|
||||
|
||||
/* if the daemon job aborted and we haven't heard from everyone yet,
|
||||
* then this could well have been caused by a daemon not finding
|
||||
* a way back to us. In this case, output a message indicating a daemon
|
||||
* died without reporting. Otherwise, say nothing as we
|
||||
* likely already output an error message */
|
||||
if (ORTE_JOB_STATE_ABORTED == jobstate &&
|
||||
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
jdata->num_procs != jdata->num_reported) {
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
|
||||
}
|
||||
|
||||
/* abort the job */
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
|
||||
/* set the global abnormal exit flag */
|
||||
@ -451,6 +468,11 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
|
||||
}
|
||||
/* if this was a daemon, report it */
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* output a message indicating we failed to launch a daemon */
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||
}
|
||||
/* abnormal termination - abort */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
@ -64,15 +64,4 @@ value will be ignored.
|
||||
Local host: %s
|
||||
Value: %s
|
||||
Message: %s
|
||||
#
|
||||
[unable-to-communicate]
|
||||
One or more TCP routes were given to a process, but no
|
||||
communication path could be found:
|
||||
|
||||
Node: %s
|
||||
Source: %s
|
||||
Peer: %s
|
||||
|
||||
This usually is caused by a lack of common network interfaces
|
||||
and no route found between them.
|
||||
|
@ -1487,15 +1487,6 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pop->peer));
|
||||
|
||||
/* eventually, we should allow other OOB components a chance to connect
|
||||
* to the target process. However, for now, we need to ensure we don't
|
||||
* have a silent failure - so emit a "couldn't connect" message
|
||||
*/
|
||||
orte_show_help("help-oob-tcp.txt", "unable-to-communicate", true,
|
||||
orte_process_info.nodename,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pop->peer));
|
||||
|
||||
cleanup:
|
||||
/* if this was a lifeline, then alert */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
|
||||
|
@ -503,14 +503,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
|
||||
if (0 != status) {
|
||||
if (failed_launch) {
|
||||
/* we have a problem during launch */
|
||||
opal_output(0, "ERROR: alps failed to start the required daemons.");
|
||||
opal_output(0, "ERROR: This could be due to an inability to find the orted binary (--prefix)");
|
||||
opal_output(0, "ERROR: on one or more remote nodes, compilation of the orted with dynamic libraries,");
|
||||
opal_output(0, "ERROR: lack of authority to execute on one or more specified nodes,");
|
||||
opal_output(0, "ERROR: or the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).");
|
||||
|
||||
/* report that the daemon has failed so we break out of the daemon
|
||||
* callback receive and exit
|
||||
*/
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
|
@ -10,6 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -48,4 +49,3 @@ are running.
|
||||
|
||||
Please consult with your system administrator about obtaining
|
||||
such support.
|
||||
|
||||
|
@ -537,6 +537,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
|
||||
"%s plm:slurm: daemon failed during launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* notify the error manager */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
} else {
|
||||
/* if this is after launch, then we need to abort only if the status
|
||||
|
@ -424,7 +424,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
}
|
||||
|
||||
@ -471,7 +471,7 @@ static void poll_spawns(int fd, short args, void *cbdata)
|
||||
|
||||
/* check for failed launch - if so, force terminate */
|
||||
if (failed_launch) {
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user