From 445b552d3a9ca1eeea57da65e240b11dfdead9d9 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 1 May 2014 19:48:21 +0000 Subject: [PATCH] Try again to get an error message printed when a daemon fails to successfully report back to mpirun. In this case, there is no guaranteed way for the daemon to output the error report itself - we don't have a connection back to the HNP, and we have tied stderr off to /dev/null (for good reasons). So the HNP has to detect the failure itself and report it. The HNP can't know the precise reason, of course - all it knows is that the daemon failed. So output a generic error message that provides guidance on probable causes. Refs trac:4571 This commit was SVN r31589. The following Trac tickets were found above: Ticket 4571 --> https://svn.open-mpi.org/trac/ompi/ticket/4571 --- orte/mca/errmgr/base/help-errmgr-base.txt | 32 +++++++++++++++++++ .../errmgr/default_hnp/errmgr_default_hnp.c | 22 +++++++++++++ orte/mca/oob/tcp/help-oob-tcp.txt | 11 ------- orte/mca/oob/tcp/oob_tcp_component.c | 9 ------ orte/mca/plm/alps/plm_alps_module.c | 8 ----- orte/mca/plm/slurm/help-plm-slurm.txt | 2 +- orte/mca/plm/slurm/plm_slurm_module.c | 1 + orte/mca/plm/tm/plm_tm_module.c | 4 +-- 8 files changed, 58 insertions(+), 31 deletions(-) diff --git a/orte/mca/errmgr/base/help-errmgr-base.txt b/orte/mca/errmgr/base/help-errmgr-base.txt index 375136ea86..bdf0c40b9d 100644 --- a/orte/mca/errmgr/base/help-errmgr-base.txt +++ b/orte/mca/errmgr/base/help-errmgr-base.txt @@ -10,6 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2014 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -27,3 +28,34 @@ address is unknown: On node: %s The message could not be delivered, and we are aborting. +# +[failed-daemon-launch] +ORTE was unable to reliably start one or more daemons. 
+This is usually caused by: + +* not finding the required libraries and/or binaries on + one or more nodes. Please check your PATH and LD_LIBRARY_PATH + settings, or configure OMPI with --enable-orterun-prefix-by-default + +* lack of authority to execute on one or more specified nodes. + Please verify your allocation and authorities. + +* the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base). + Please check with your system administrator to determine the correct location to use. + +* compilation of the orted with dynamic libraries when static libraries are required + (e.g., on Cray). Please check your configure command line and consider using + one of the contrib/platform definitions for your system type. + +* an inability to create a connection back to mpirun due to a + lack of common network interfaces and/or no route found between + them. Please check network connectivity (including firewalls + and network routing requirements). +# +[failed-daemon] +An ORTE daemon has unexpectedly failed after launch and before +communicating back to mpirun. This could be caused by a number +of factors, including an inability to create a connection back +to mpirun due to a lack of common network interfaces and/or no +route found between them. Please check network connectivity +(including firewalls and network routing requirements). 
diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index ee3d6a30a6..06243e9719 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -208,8 +208,25 @@ static void job_errors(int fd, short args, void *cbdata) } } } + /* if this is the daemon job, then we need to ensure we + * output an error message indicating we couldn't launch the + * daemons */ + if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { + orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true); + } } + /* if the daemon job aborted and we haven't heard from everyone yet, + * then this could well have been caused by a daemon not finding + * a way back to us. In this case, output a message indicating a daemon + * died without reporting. Otherwise, say nothing as we + * likely already output an error message */ + if (ORTE_JOB_STATE_ABORTED == jobstate && + jdata->jobid == ORTE_PROC_MY_NAME->jobid && + jdata->num_procs != jdata->num_reported) { + orte_show_help("help-errmgr-base.txt", "failed-daemon", true); + } + /* abort the job */ ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT); /* set the global abnormal exit flag */ @@ -451,6 +468,11 @@ static void proc_errors(int fd, short args, void *cbdata) jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); } + /* if this was a daemon, report it */ + if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { + /* output a message indicating we failed to launch a daemon */ + orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true); + } /* abnormal termination - abort */ default_hnp_abort(jdata); break; diff --git a/orte/mca/oob/tcp/help-oob-tcp.txt b/orte/mca/oob/tcp/help-oob-tcp.txt index d9d7864587..aa58608f63 100644 --- a/orte/mca/oob/tcp/help-oob-tcp.txt +++ b/orte/mca/oob/tcp/help-oob-tcp.txt @@ -64,15 +64,4 @@ value will be ignored. 
Local host: %s Value: %s Message: %s -# -[unable-to-communicate] -One or more TCP routes were given to a process, but no -communication path could be found: - - Node: %s - Source: %s - Peer: %s - -This usually is caused by a lack of common network interfaces -and no route found between them. \ No newline at end of file diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 3a2bba7631..e413fc181b 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -1487,15 +1487,6 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); - /* eventually, we should allow other OOB components a chance to connect - * to the target process. However, for now, we need to ensure we don't - * have a silent failure - so emit a "couldn't connect" message - */ - orte_show_help("help-oob-tcp.txt", "unable-to-communicate", true, - orte_process_info.nodename, - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pop->peer)); - cleanup: /* if this was a lifeline, then alert */ if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) { diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c index eeec9ac4cd..894df2b96f 100644 --- a/orte/mca/plm/alps/plm_alps_module.c +++ b/orte/mca/plm/alps/plm_alps_module.c @@ -503,14 +503,6 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){ if (0 != status) { if (failed_launch) { - /* we have a problem during launch */ - opal_output(0, "ERROR: alps failed to start the required daemons."); - opal_output(0, "ERROR: This could be due to an inability to find the orted binary (--prefix)"); - opal_output(0, "ERROR: on one or more remote nodes, compilation of the orted with dynamic libraries,"); - opal_output(0, "ERROR: lack of authority to execute on one or more specified nodes,"); - opal_output(0, "ERROR: or the inability to write startup files into 
/tmp (--tmpdir/orte_tmpdir_base)."); - - /* report that the daemon has failed so we break out of the daemon * callback receive and exit */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START); diff --git a/orte/mca/plm/slurm/help-plm-slurm.txt b/orte/mca/plm/slurm/help-plm-slurm.txt index fc9f1afde7..369b339c99 100644 --- a/orte/mca/plm/slurm/help-plm-slurm.txt +++ b/orte/mca/plm/slurm/help-plm-slurm.txt @@ -10,6 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2014 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -48,4 +49,3 @@ are running. Please consult with your system administrator about obtaining such support. - diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index c6a1688de0..c9290a55cf 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -537,6 +537,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){ OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: daemon failed during launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* notify the error manager */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START); } else { /* if this is after launch, then we need to abort only if the status diff --git a/orte/mca/plm/tm/plm_tm_module.c b/orte/mca/plm/tm/plm_tm_module.c index d849d33427..ab4ef0829e 100644 --- a/orte/mca/plm/tm/plm_tm_module.c +++ b/orte/mca/plm/tm/plm_tm_module.c @@ -424,7 +424,7 @@ static void launch_daemons(int fd, short args, void *cbdata) /* check for failed launch - if so, force terminate */ if (failed_launch) { - ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START); } } @@ -471,7 +471,7 @@ static void poll_spawns(int fd, short args, void *cbdata) /* check for failed launch - if 
so, force terminate */ if (failed_launch) { - ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); + ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_FAILED_TO_START); } }