Cleanly handle the failed start of an orted, or its unexpected failure after start. This commit allows mpirun to exit cleanly when this occurs, and makes a best-effort attempt to clean up the mess. However, it still has two unresolved issues that eventually need to be addressed:

1. It depends upon the ability of the native environment to alert us that the orted has died or failed to start. I have included that support for SLURM, but support for other environments still needs to be added. 2. For some yet-to-be-determined reason, the message that tells the remaining daemons to "die" isn't getting out of the RML, even though no obvious blockage is standing in the way. Work will continue on resolving that problem. For now, the orteds appear to be exiting on their own quite nicely when they see their HNP "lifeline" disappear. This represents the best-available fix for ticket #221, so I am closing that ticket at this time. This commit was SVN r18536.
2008-05-29 13:38:27 +00:00 · 2008-05-29 13:38:27 +00:00 · 72530f8fed
--- a/orte/mca/plm/base/help-plm-base.txt
+++ b/orte/mca/plm/base/help-plm-base.txt
@ -24,7 +24,7 @@ any mechanism to launch proceses, and therefore is unable to start the
 process(es) required by your application.
 #
 [daemon-died-no-signal]
-A daemon (pid %d) died unexpectedly with status %d while attempting
+A daemon (pid %s) died unexpectedly with status %d while attempting
 to launch so we are aborting.

 There may be more information reported by the environment (see above).
@ -35,7 +35,7 @@ location of the shared libraries on the remote nodes and this will
 automatically be forwarded to the remote nodes.
 #
 [daemon-died-signal-core]
-A daemon (pid %d) died unexpectedly on signal %d (with core) while
+A daemon (pid %s) died unexpectedly on signal %d (with core) while
 attempting to launch so we are aborting.

 There may be more information reported by the environment (see above).
@ -46,7 +46,7 @@ location of the shared libraries on the remote nodes and this will
 automatically be forwarded to the remote nodes.
 #
 [daemon-died-signal]
-A daemon (pid %d) died unexpectedly on signal %d  while attempting to
+A daemon (pid %s) died unexpectedly on signal %d  while attempting to
 launch so we are aborting.

 There may be more information reported by the environment (see above).
--- a/orte/mca/plm/base/plm_base_launch_support.c
+++ b/orte/mca/plm/base/plm_base_launch_support.c
@ -227,6 +227,7 @@ void orte_plm_base_launch_failed(orte_jobid_t job, bool daemons_launching, pid_t
                                 int status, orte_job_state_t state)
 {
    orte_job_t *jdata;
+    char *pidstr;
    
    ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:launch_failed for job %s during %s",
@ -234,23 +235,32 @@ void orte_plm_base_launch_failed(orte_jobid_t job, bool daemons_launching, pid_t
                         ORTE_JOBID_PRINT(job),
                         (daemons_launching) ? "daemon launch" : "app launch"));

+    if (0 < pid) {
+        asprintf(&pidstr, "%d", (int)pid);
+    } else {
+        /* if the pid is negative, then we couldn't get a real pid
+         * to report here - so tell someone that
+         */
+        pidstr = strdup("NO PID");
+    }
+
    if (daemons_launching) {
        if (WIFSIGNALED(status)) { /* died on signal */
 #ifdef WCOREDUMP
            if (WCOREDUMP(status)) {
                orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
-                               pid, WTERMSIG(status));
+                               pidstr, WTERMSIG(status));
            } else {
                orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
-                               pid, WTERMSIG(status));
+                               pidstr, WTERMSIG(status));
            }
 #else
            orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
-                           pid, WTERMSIG(status));
+                           pidstr, WTERMSIG(status));
 #endif /* WCOREDUMP */
        } else {
            orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
-                           pid, WEXITSTATUS(status));
+                           pidstr, WEXITSTATUS(status));
        }
        orted_failed_launch = true;
        /* set the flag indicating that a daemon failed so we use the proper
@ -259,6 +269,7 @@ void orte_plm_base_launch_failed(orte_jobid_t job, bool daemons_launching, pid_t
        orte_abnormal_term_ordered = true;
        
    }
+    free(pidstr);
    
    /* Set the job state as indicated so orterun's exit status
       will be non-zero
@ -683,6 +694,12 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,
    if (orted_spin_flag) {
        opal_argv_append(argc, argv, "--spin");
    }
+    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
+        opal_argv_append(argc, argv, "--debug-failure");
+        asprintf(&param, "%d", orted_debug_failure);
+        opal_argv_append(argc, argv, param);
+        free(param);
+    }
    
    /* tell the orted what SDS component to use */
    opal_argv_append(argc, argv, "-mca");
--- a/orte/mca/plm/base/plm_base_orted_cmds.c
+++ b/orte/mca/plm/base/plm_base_orted_cmds.c
@ -41,7 +41,7 @@
 #include "orte/mca/plm/base/base.h"
 #include "orte/mca/plm/base/plm_private.h"

-static opal_event_t *ev;
+static opal_event_t *ev=NULL;
 static orte_vpid_t num_reported, num_being_sent;
 static bool done_reporting;

@ -63,10 +63,19 @@ static void send_callback(int status,
                          orte_rml_tag_t tag,
                          void* cbdata)
 {
+    ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
+                         "%s plm:base:orted_cmd message to %s sent",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(peer)));
+    
    num_reported++;
    if (num_reported == num_being_sent) {
        /* cancel the timer */
-        opal_event_del(ev);
+        if (NULL != ev) {
+            opal_event_del(ev);
+            ev = NULL;
+        }
+        
        /* mark as done */
        done_reporting = true;
        
@ -138,35 +147,22 @@ int orte_plm_base_orted_exit(void)
                             "%s plm:base:orted_cmd:orted_exit abnormal term ordered",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        
-        /* since we cannot know which daemons may/may not be alive,
-         * setup an event so we will time out after giving the send
-         * our best attempt
-         */
-        ORTE_DETECT_TIMEOUT(&ev, orte_process_info.num_procs,
-                            orte_timeout_usec_per_proc,
-                            orte_max_timeout, failed_send);
-        /* if I am the HNP, I need to get this message too, but just set things
-         * up so the cmd processor gets called.
-         * We don't want to message ourselves as this can create circular logic
-         * in the RML. Instead, this macro will set a zero-time event which will
-         * cause the buffer to be processed by the cmd processor - probably will
-         * fire right away, but that's okay
-         * The macro makes a copy of the buffer, so it's okay to release it here
-         */
-        if (orte_process_info.hnp) {
-            ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &cmd, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
-        }
-        
        /* now send the command one daemon at a time using a non-blocking
         * send - let the callback function keep track of how many
         * complete - it will delete the event if they all do.
-         * Start with vpid=1 as the HNP gets it another way
+         * Start with vpid=1 as the HNP is told to exit another way
         */
        done_reporting = false;
        num_reported = 0;
-        num_being_sent = orte_process_info.num_procs-1;
+        num_being_sent = 0;
        peer.jobid = ORTE_PROC_MY_NAME->jobid;
-        for(v=1; v < orte_process_info.num_procs; v++) {
+        for(v=1; v < daemons->num_procs; v++) {
+            /* if we don't have contact info for this daemon,
+             * then we know we can't reach it - so don't try
+             */
+            if (NULL == procs[v]->rml_uri) {
+                continue;
+            }
            peer.vpid = v;
            /* check to see if this daemon is known to be "dead" */
            if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
@ -176,18 +172,35 @@ int orte_plm_base_orted_exit(void)
            /* don't worry about errors on the send here - just
             * issue it and keep going
             */
+            ++num_being_sent;
+            ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
+                                 "%s plm:base:orted_cmd:orted_exit sending cmd to %s",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(&peer)));
            orte_rml.send_buffer_nb(&peer, &cmd, ORTE_RML_TAG_DAEMON, 0,
                                    send_callback, 0);
        }
+        OBJ_DESTRUCT(&cmd); /* done with this */
+        
+        /* since we cannot know which daemons may/may not be alive,
+         * setup an event so we will time out after giving the send
+         * our best attempt
+         */
+        ORTE_DETECT_TIMEOUT(&ev, num_being_sent,
+                            1000*orte_timeout_usec_per_proc,
+                            10*orte_max_timeout, failed_send);
+        
        /* wait for completion or timeout */
-        while (!done_reporting) {
-            opal_progress();
+        ORTE_PROGRESSED_WAIT(done_reporting, num_reported, num_being_sent);
+
+        /* cleanup the timer */
+        if (NULL != ev) {
+            opal_event_del(ev);
+            ev = NULL;
        }
-        OBJ_DESTRUCT(&cmd);
        
        /* if all the sends didn't go, report that */
-        if (num_reported != num_being_sent) {
-            orte_show_help("help-plm-base.txt", "incomplete-exit-cmd", true);
+        if (num_reported < num_being_sent) {
            return ORTE_ERR_SILENT;
        }

@ -254,14 +267,6 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
        }
        procs = (orte_proc_t**)daemons->procs->addr;

-        /* since we cannot know which daemons may/may not be alive,
-         * setup an event so we will time out after giving the send
-         * our best attempt
-         */
-        ORTE_DETECT_TIMEOUT(&ev, orte_process_info.num_procs,
-                            orte_timeout_usec_per_proc,
-                            orte_max_timeout, failed_send);
-       
        /* if I am the HNP, I need to get this message too, but just set things
         * up so the cmd processor gets called.
         * We don't want to message ourselves as this can create circular logic
@ -281,9 +286,15 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
         */
        done_reporting = false;
        num_reported = 0;
-        num_being_sent = orte_process_info.num_procs-1;
+        num_being_sent = 0;
        peer.jobid = ORTE_PROC_MY_NAME->jobid;
-        for(v=1; v < orte_process_info.num_procs; v++) {
+        for(v=1; v < daemons->num_procs; v++) {
+            /* if we don't have contact info for this daemon,
+             * then we know we can't reach it - so don't try
+             */
+            if (NULL == procs[v]->rml_uri) {
+                continue;
+            }
            peer.vpid = v;
            /* check to see if this daemon is known to be "dead" */
            if (procs[v]->state > ORTE_PROC_STATE_UNTERMINATED) {
@ -293,18 +304,35 @@ int orte_plm_base_orted_kill_local_procs(orte_jobid_t job)
            /* don't worry about errors on the send here - just
             * issue it and keep going
             */
+            ++num_being_sent;
+            ORTE_OUTPUT_VERBOSE((5, orte_plm_globals.output,
+                                 "%s plm:base:orted_cmd:kill_local_procs sending cmd to %s",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(&peer)));
            orte_rml.send_buffer_nb(&peer, &cmd, ORTE_RML_TAG_DAEMON, 0,
                                send_callback, 0);
        }
+        OBJ_DESTRUCT(&cmd); /* done with this */
+        
+        /* since we cannot know which daemons may/may not be alive,
+         * setup an event so we will time out after giving the send
+         * our best attempt
+         */
+        ORTE_DETECT_TIMEOUT(&ev, num_being_sent,
+                            1000*orte_timeout_usec_per_proc,
+                            10*orte_max_timeout, failed_send);
+        
        /* wait for completion or timeout */
-        while (!done_reporting) {
-            opal_progress();
+        ORTE_PROGRESSED_WAIT(done_reporting, num_reported, num_being_sent);
+        
+        /* cleanup the timer */
+        if (NULL != ev) {
+            opal_event_del(ev);
+            ev = NULL;
        }
-        OBJ_DESTRUCT(&cmd);
        
        /* if all the sends didn't go, report that */
-        if (num_reported != num_being_sent) {
-            orte_show_help("help-plm-base.txt", "incomplete-kill-procs-cmd", true);
+        if (num_reported < num_being_sent) {
            return ORTE_ERR_SILENT;
        }
        
--- a/orte/mca/plm/slurm/plm_slurm_module.c
+++ b/orte/mca/plm/slurm/plm_slurm_module.c
@ -494,19 +494,23 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
       now, though, the only thing that really matters is that
       srun failed. Report the error and make sure that orterun
       wakes up - otherwise, do nothing!
+     
+       Unfortunately, the pid returned here is the srun pid, not the pid of
+       the proc that actually died! So, to avoid confusion, just use -1 as the
+       pid so nobody thinks this is real
    */
    
    if (0 != status) {
        if (failed_launch) {
            /* report that the daemon has failed so we can exit
             */
-            orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_FAILED_TO_START);
+            orte_plm_base_launch_failed(active_job, true, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
            
        } else {
            /* an orted must have died unexpectedly after launch - report
             * that the daemon has failed so we exit
             */
-            orte_plm_base_launch_failed(active_job, true, pid, status, ORTE_JOB_STATE_ABORTED);
+            orte_plm_base_launch_failed(active_job, true, -1, status, ORTE_JOB_STATE_ABORTED);
        }
    }
    
--- a/orte/orted/orted_main.c
+++ b/orte/orted/orted_main.c
@ -107,6 +107,7 @@ static struct {
    char* num_procs;
    int uri_pipe;
    int singleton_died_pipe;
+    int fail;
 } orted_globals;

 /*
@ -122,6 +123,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
      NULL, OPAL_CMD_LINE_TYPE_BOOL,
      "Have the orted spin until we can connect a debugger to it" },

+    { NULL, NULL, NULL, '\0', NULL, "debug-failure", 1,
+        &orted_globals.fail, OPAL_CMD_LINE_TYPE_INT,
+      "Have the specified orted fail after init for debugging purposes" },
+    
    { "orte", "debug", NULL, 'd', NULL, "debug", 0,
        NULL, OPAL_CMD_LINE_TYPE_BOOL,
        "Debug the OpenRTE" },
@ -188,6 +193,8 @@ int orte_daemon(int argc, char *argv[])
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
+    /* init the failure orted vpid to an invalid value */
+    orted_globals.fail = ORTE_VPID_INVALID;
    
    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
@ -293,6 +300,16 @@ int orte_daemon(int argc, char *argv[])
        return ret;
    }
    
+    if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
+        /* Finalize and clean up ourselves */
+        if (ORTE_SUCCESS != (ret = orte_finalize())) {
+            ORTE_ERROR_LOG(ret);
+        }
+        
+        /* return with non-zero status */
+        return -1;
+    }
+
    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
--- a/orte/runtime/orte_globals.c
+++ b/orte/runtime/orte_globals.c
@ -56,6 +56,7 @@ bool orte_help_want_aggregate = true;
 bool orte_help_show_recursions;
 bool orte_params_set = false;
 int orte_debug_verbosity;
+int orted_debug_failure = ORTE_VPID_INVALID;

 int32_t orte_contiguous_nodes;
 int orte_debug_output = -1;
--- a/orte/runtime/orte_globals.h
+++ b/orte/runtime/orte_globals.h
@ -325,6 +325,7 @@ ORTE_DECLSPEC extern bool orte_help_want_aggregate;
 ORTE_DECLSPEC extern bool orte_help_show_recursions;
 ORTE_DECLSPEC extern bool orte_params_set;
 ORTE_DECLSPEC extern int orte_debug_verbosity;
+ORTE_DECLSPEC extern int orted_debug_failure;

 ORTE_DECLSPEC extern char **orte_launch_environ;
 ORTE_DECLSPEC extern opal_pointer_array_t orte_daemonmap;
--- a/orte/runtime/orte_mca_params.c
+++ b/orte/runtime/orte_mca_params.c
@ -75,6 +75,10 @@ int orte_register_params(void)
                                false, false, (int)false, &value);
    orted_spin_flag = OPAL_INT_TO_BOOL(value);

+    mca_base_param_reg_int_name("orte", "daemon_fail",
+                                "Have the specified orted fail after init for debugging purposes",
+                                false, false, (int)false, &orted_debug_failure);
+    
    /* check for timing requests */
    mca_base_param_reg_int_name("orte", "timing",
                                "Request that critical timing loops be measured",
--- a/orte/tools/orterun/orterun.c
+++ b/orte/tools/orterun/orterun.c
@ -570,6 +570,7 @@ static void job_completed(int trigpipe, short event, void *arg)
 {
    int rc;
    orte_job_state_t exit_state;
+    orte_job_t *daemons;
    
    /* flag that we are here to avoid doing it twice */
    if (!opal_atomic_trylock(&orte_job_complete_lock)) { /* returns 1 if already locked */
@ -622,7 +623,12 @@ static void job_completed(int trigpipe, short event, void *arg)
         * so
         */
        opal_event_del(orteds_exit_event);
-        ORTE_DETECT_TIMEOUT(&ev, orte_process_info.num_procs,
+        /* get the orted job data object */
+        if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
+            /* we are totally hozed */
+            goto DONE;
+        }
+        ORTE_DETECT_TIMEOUT(&ev, daemons->num_procs,
                            orte_timeout_usec_per_proc,
                            orte_max_timeout, terminated);
    }