1
1

Try again to get an error message printed when a daemon fails to report back to mpirun. In this case, there is no guaranteed way for the daemon to output the error report itself - it has no connection back to the HNP, and its stderr has been tied off to /dev/null (for good reasons). So the HNP has to detect the failure itself and report it.

The HNP can't know the precise reason, of course - all it knows is that the daemon failed. So output a generic error message that provides guidance on probable causes.

Refs trac:4571

This commit was SVN r31589.

The following Trac tickets were found above:
  Ticket 4571 --> https://svn.open-mpi.org/trac/ompi/ticket/4571
Этот коммит содержится в:
Ralph Castain 2014-05-01 19:48:21 +00:00
родитель 0fac9781b3
Коммит 445b552d3a
8 изменённых файлов: 58 добавлений и 31 удалений

Просмотреть файл

@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -27,3 +28,34 @@ address is unknown:
On node: %s
The message could not be delivered, and we are aborting.
#
[failed-daemon-launch]
ORTE was unable to reliably start one or more daemons.
This usually is caused by:
* not finding the required libraries and/or binaries on
one or more nodes. Please check your PATH and LD_LIBRARY_PATH
settings, or configure OMPI with --enable-orterun-prefix-by-default
* lack of authority to execute on one or more specified nodes.
Please verify your allocation and authorities.
* the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).
Please check with your sys admin to determine the correct location to use.
* compilation of the orted with dynamic libraries when static are required
(e.g., on Cray). Please check your configure cmd line and consider using
one of the contrib/platform definitions for your system type.
* an inability to create a connection back to mpirun due to a
lack of common network interfaces and/or no route found between
them. Please check network connectivity (including firewalls
and network routing requirements).
#
[failed-daemon]
An ORTE daemon has unexpectedly failed after launch and before
communicating back to mpirun. This could be caused by a number
of factors, including an inability to create a connection back
to mpirun due to a lack of common network interfaces and/or no
route found between them. Please check network connectivity
(including firewalls and network routing requirements).

Просмотреть файл

@ -208,6 +208,23 @@ static void job_errors(int fd, short args, void *cbdata)
}
}
}
/* if this is the daemon job, then we need to ensure we
* output an error message indicating we couldn't launch the
* daemons */
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
}
}
/* if the daemon job aborted and we haven't heard from everyone yet,
* then this could well have been caused by a daemon not finding
* a way back to us. In this case, output a message indicating a daemon
* died without reporting. Otherwise, say nothing as we
* likely already output an error message */
if (ORTE_JOB_STATE_ABORTED == jobstate &&
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
jdata->num_procs != jdata->num_reported) {
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
}
/* abort the job */
@ -451,6 +468,11 @@ static void proc_errors(int fd, short args, void *cbdata)
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
}
/* if this was a daemon, report it */
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
/* output a message indicating we failed to launch a daemon */
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
}
/* abnormal termination - abort */
default_hnp_abort(jdata);
break;

Просмотреть файл

@ -64,15 +64,4 @@ value will be ignored.
Local host: %s
Value: %s
Message: %s
#
[unable-to-communicate]
One or more TCP routes were given to a process, but no
communication path could be found:
Node: %s
Source: %s
Peer: %s
This usually is caused by a lack of common network interfaces
and no route found between them.

Просмотреть файл

@ -1487,15 +1487,6 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pop->peer));
/* eventually, we should allow other OOB components a chance to connect
* to the target process. However, for now, we need to ensure we don't
* have a silent failure - so emit a "couldn't connect" message
*/
orte_show_help("help-oob-tcp.txt", "unable-to-communicate", true,
orte_process_info.nodename,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pop->peer));
cleanup:
/* if this was a lifeline, then alert */
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {

Просмотреть файл

@ -503,14 +503,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
if (0 != status) {
if (failed_launch) {
/* we have a problem during launch */
opal_output(0, "ERROR: alps failed to start the required daemons.");
opal_output(0, "ERROR: This could be due to an inability to find the orted binary (--prefix)");
opal_output(0, "ERROR: on one or more remote nodes, compilation of the orted with dynamic libraries,");
opal_output(0, "ERROR: lack of authority to execute on one or more specified nodes,");
opal_output(0, "ERROR: or the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).");
/* report that the daemon has failed so we break out of the daemon
* callback receive and exit
*/
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);

Просмотреть файл

@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -48,4 +49,3 @@ are running.
Please consult with your system administrator about obtaining
such support.

Просмотреть файл

@ -537,6 +537,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
"%s plm:slurm: daemon failed during launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* notify the error manager */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
} else {
/* if this is after launch, then we need to abort only if the status

Просмотреть файл

@ -424,7 +424,7 @@ static void launch_daemons(int fd, short args, void *cbdata)
/* check for failed launch - if so, force terminate */
if (failed_launch) {
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START);
}
}
@ -471,7 +471,7 @@ static void poll_spawns(int fd, short args, void *cbdata)
/* check for failed launch - if so, force terminate */
if (failed_launch) {
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_FAILED_TO_START);
}
}