Try again to get an error message printed when a daemon fails to report back to mpirun. In this case, there is no guaranteed way for the daemon to output the error report itself: it has no connection back to the HNP, and its stderr has been tied off to /dev/null (for good reasons). So the HNP has to detect the failure itself and report it.

The HNP can't know the precise reason, of course - all it knows is that the daemon failed. So output a generic error message that provides guidance on probable causes. Refs trac:4571. This commit was SVN r31589. The following Trac tickets were found above: Ticket 4571 --> https://svn.open-mpi.org/trac/ompi/ticket/4571
2014-05-01 19:48:21 +00:00 · 2014-05-01 19:48:21 +00:00 · 445b552d3a
--- a/orte/mca/errmgr/base/help-errmgr-base.txt
+++ b/orte/mca/errmgr/base/help-errmgr-base.txt
@ -10,6 +10,7 @@
 #                         University of Stuttgart.  All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 #                         All rights reserved.
+# Copyright (c) 2014      Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 # 
 # Additional copyrights may follow
@ -27,3 +28,34 @@ address is unknown:
  On node:    %s

 The message could not be delivered, and we are aborting.
+#
+[failed-daemon-launch]
+ORTE was unable to reliably start one or more daemons.
+This usually is caused by:
+
+* not finding the required libraries and/or binaries on
+  one or more nodes. Please check your PATH and LD_LIBRARY_PATH
+  settings, or configure OMPI with --enable-orterun-prefix-by-default
+
+* lack of authority to execute on one or more specified nodes.
+  Please verify your allocation and authorities.
+
+* the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).
+  Please check with your sys admin to determine the correct location to use.
+
+* compilation of the orted with dynamic libraries when static are required
+  (e.g., on Cray). Please check your configure cmd line and consider using
+  one of the contrib/platform definitions for your system type.
+
+* an inability to create a connection back to mpirun due to a
+  lack of common network interfaces and/or no route found between
+  them. Please check network connectivity (including firewalls
+  and network routing requirements).
+#
+[failed-daemon]
+An ORTE daemon has unexpectedly failed after launch and before
+communicating back to mpirun. This could be caused by a number
+of factors, including an inability to create a connection back
+to mpirun due to a lack of common network interfaces and/or no
+route found between them. Please check network connectivity
+(including firewalls and network routing requirements).
--- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
+++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
@ -208,8 +208,25 @@ static void job_errors(int fd, short args, void *cbdata)
                }
            }
        }
+        /* if this is the daemon job, then we need to ensure we
+         * output an error message indicating we couldn't launch the
+         * daemons */
+        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
+            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
+        }
    }

+    /* if the daemon job aborted and we haven't heard from everyone yet,
+     * then this could well have been caused by a daemon not finding
+     * a way back to us. In this case, output a message indicating a daemon
+     * died without reporting. Otherwise, say nothing as we
+     * likely already output an error message */
+    if (ORTE_JOB_STATE_ABORTED == jobstate &&
+        jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
+        jdata->num_procs != jdata->num_reported) {
+        orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
+    }
+        
    /* abort the job */
    ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
    /* set the global abnormal exit flag  */
@ -451,6 +468,11 @@ static void proc_errors(int fd, short args, void *cbdata)
            jdata->abort = true;
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        }
+        /* if this was a daemon, report it */
+        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
+            /* output a message indicating we failed to launch a daemon */
+            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
+        }
        /* abnormal termination - abort */
        default_hnp_abort(jdata);
        break;
--- a/orte/mca/oob/tcp/help-oob-tcp.txt
+++ b/orte/mca/oob/tcp/help-oob-tcp.txt
@ -64,15 +64,4 @@ value will be ignored.
  Local host: %s
  Value:      %s
  Message:    %s
-#
-[unable-to-communicate]
-One or more TCP routes were given to a process, but no
-communication path could be found:
-
-  Node:   %s
-  Source: %s
-  Peer:   %s
-
-This usually is caused by a lack of common network interfaces
-and no route found between them.
  
--- a/orte/mca/oob/tcp/oob_tcp_component.c
+++ b/orte/mca/oob/tcp/oob_tcp_component.c
@ -1487,15 +1487,6 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&pop->peer));

-    /* eventually, we should allow other OOB components a chance to connect
-     * to the target process. However, for now, we need to ensure we don't
-     * have a silent failure - so emit a "couldn't connect" message
-     */
-    orte_show_help("help-oob-tcp.txt", "unable-to-communicate", true,
-                   orte_process_info.nodename,
-                   ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                   ORTE_NAME_PRINT(&pop->peer));
-
 cleanup:
    /* if this was a lifeline, then alert */
    if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
--- a/orte/mca/plm/alps/plm_alps_module.c
+++ b/orte/mca/plm/alps/plm_alps_module.c
@ -503,14 +503,7 @@ static void alps_wait_cb(pid_t pid, int status, void* cbdata){
    
    if (0 != status) {
        if (failed_launch) {
-            /* we have a problem during launch */
-            opal_output(0, "ERROR: alps failed to start the required daemons.");
-            opal_output(0, "ERROR: This could be due to an inability to find the orted binary (--prefix)");
-            opal_output(0, "ERROR: on one or more remote nodes, compilation of the orted with dynamic libraries,");
-            opal_output(0, "ERROR: lack of authority to execute on one or more specified nodes,");
-            opal_output(0, "ERROR: or the inability to write startup files into /tmp (--tmpdir/orte_tmpdir_base).");
-            
            /* report that the daemon has failed so we break out of the daemon
             * callback receive and exit
             */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);            
--- a/orte/mca/plm/slurm/help-plm-slurm.txt
+++ b/orte/mca/plm/slurm/help-plm-slurm.txt
@ -10,6 +10,7 @@
 #                         University of Stuttgart.  All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 #                         All rights reserved.
+# Copyright (c) 2014      Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 # 
 # Additional copyrights may follow
@ -48,4 +49,3 @@ are running.

 Please consult with your system administrator about obtaining
 such support.
-
--- a/orte/mca/plm/slurm/plm_slurm_module.c
+++ b/orte/mca/plm/slurm/plm_slurm_module.c
@ -537,6 +537,7 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:slurm: daemon failed during launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+        /* notify the error manager */
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
    } else {
        /* if this is after launch, then we need to abort only if the status
--- a/orte/mca/plm/tm/plm_tm_module.c
+++ b/orte/mca/plm/tm/plm_tm_module.c
@ -424,7 +424,7 @@ static void launch_daemons(int fd, short args, void *cbdata)

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
-        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+        ORTE_ACTIVATE_JOB_STATE(daemons, ORTE_JOB_STATE_FAILED_TO_START);
    }
 }

@ -471,7 +471,7 @@ static void poll_spawns(int fd, short args, void *cbdata)

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
-        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_FAILED_TO_START);
    }
 }