1
1

Resolve a race condition when running under a SLURM environment.

The slurm plm fork/exec's a call to srun to launch its daemons. When mpirun terminates, it then sends out a "terminate" command to those daemons. The daemons respond back to mpirun, and then exit.

If slurm itself is running on a slow network, and mpirun is running the OOB across a fast network, then it is possible for mpirun to receive notification of daemon termination and exit -before- the srun can complete its bookkeeping and declare the job as complete. When this happens, slurm becomes confused and loses state.

Mucho bad. :-/

This commit changes the termination logic so that mpirun will wait for srun to report complete before exiting. It also enables fully routed communications since it no longer requires daemons to report back that they are terminating, thus allowing the daemons to terminate asynchronously (thereby breaking routing paths).

This commit was SVN r20018.
This commit is contained in:
Ralph Castain 2008-11-18 13:59:23 +00:00
parent 182b15e252
commit 89559396ea

View file

@@ -101,9 +101,10 @@ orte_plm_base_module_1_0_0_t orte_plm_slurm_module = {
/*
* Local variables
*/
static pid_t srun_pid = 0;
static pid_t primary_srun_pid = 0;
static bool primary_pid_set = false;
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
static bool failed_launch;
static bool launching_daemons;
/**
@@ -148,6 +149,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
int proc_vpid_index;
orte_jobid_t failed_job;
orte_job_state_t job_state = ORTE_JOB_NEVER_LAUNCHED;
bool failed_launch=false;
/* flag the daemons as failing by default */
failed_job = ORTE_PROC_MY_NAME->jobid;
@@ -161,7 +163,7 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
}
/* indicate the state of the launch */
failed_launch = true;
launching_daemons = true;
/* create a jobid for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
@@ -368,7 +370,10 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
}
launch_apps:
/* get here if daemons launch okay - any failures now by apps */
/* get here if daemons launch okay, or no daemons need to be launched - any
* failures now are from launching apps
*/
launching_daemons = false;
failed_job = active_job;
if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
@@ -437,15 +442,10 @@ static int plm_slurm_terminate_orteds(void)
{
int rc;
/* deregister the waitpid callback to ensure we don't make it look like
* srun failed when it didn't. Since the srun may have already completed,
* do NOT ERROR_LOG any return code to avoid confusing, duplicate error
* messages
/* tell them to die without sending a reply - we will rely on the
* waitpid to tell us when they have exited!
*/
orte_wait_cb_cancel(srun_pid);
/* tell them to die! */
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_WITH_REPLY_CMD))) {
if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_NO_REPLY_CMD))) {
ORTE_ERROR_LOG(rc);
}
@@ -483,6 +483,8 @@ static int plm_slurm_finalize(void)
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
orte_job_t *jdata;
/* According to the SLURM folks, srun always returns the highest exit
code of our remote processes. Thus, a non-zero exit status doesn't
necessarily mean that srun failed - it could be that an orted returned
@@ -503,20 +505,41 @@ static void srun_wait_cb(pid_t pid, int status, void* cbdata){
pid so nobody thinks this is real
*/
if (0 != status) {
if (failed_launch) {
/* report that the daemon has failed so we can exit
*/
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
} else {
/* if we are in the launch phase, then any termination is bad */
if (launching_daemons) {
/* report that one or more daemons failed to launch so we can exit */
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: daemon failed during launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_FAILED_TO_START);
} else {
/* if this is after launch, then we need to abort only if the status
* returned is non-zero - i.e., if the orteds exited with an error
*/
if (0 != status) {
/* an orted must have died unexpectedly after launch - report
* that the daemon has failed so we exit
*/
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: daemon failed while running",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, -1, status, ORTE_JOB_STATE_ABORTED);
}
/* otherwise, check to see if this is the primary pid */
if (primary_srun_pid == pid) {
/* in this case, we just want to fire the proper trigger so
* mpirun can exit
*/
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurm: primary daemons complete!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
jdata->state = ORTE_JOB_STATE_TERMINATED;
/* need to set the #terminated value to avoid an incorrect error msg */
jdata->num_terminated = jdata->num_procs;
orte_trigger_event(&orteds_exit);
}
}
}
@@ -524,6 +547,7 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
char *prefix)
{
int fd;
int srun_pid;
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
if (NULL == exec_argv) {
@@ -627,6 +651,14 @@ static int plm_slurm_start_proc(int argc, char **argv, char **env,
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
free(exec_argv);
/* if this is the primary launch - i.e., not a comm_spawn of a
* child job - then save the pid
*/
if (!primary_pid_set) {
primary_srun_pid = srun_pid;
primary_pid_set = true;
}
}
return ORTE_SUCCESS;