
A little cleanup and progress:

- build a proper srun argv
- launch the srun (see the sketch after this commit message)
- still have several "JMS" comments that need to be addressed

This commit was SVN r7036.
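
For context, a minimal standalone sketch of the pattern this commit adds: append the srun command, its node count, and the daemon's arguments to an argv, then fork and exec it, stashing the child pid. This is not the ORTE code itself -- it deliberately avoids the OPAL helpers (opal_argv_append, opal_path_findv) so it compiles on its own, uses execvp instead of execve, and the "orted"/node-count values are placeholders.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

/* crude stand-in for opal_argv_append(): grow a NULL-terminated argv */
static void argv_append(int *argc, char ***argv, const char *arg)
{
    *argv = realloc(*argv, (*argc + 2) * sizeof(char *));
    (*argv)[(*argc)++] = strdup(arg);
    (*argv)[*argc] = NULL;
}

int main(void)
{
    char **argv = NULL;
    char tmp[64];
    int argc = 0;
    unsigned long num_nodes = 2;   /* placeholder: the real code gets this from the RAS */
    pid_t srun_pid;

    /* build the srun command line, mirroring the commit's opal_argv_append() calls */
    argv_append(&argc, &argv, "srun");
    argv_append(&argc, &argv, "--mpi=lam");     /* task count == node count */
    snprintf(tmp, sizeof(tmp), "--nodes=%lu", num_nodes);
    argv_append(&argc, &argv, tmp);
    argv_append(&argc, &argv, "orted");         /* placeholder for the component's daemon */
    argv_append(&argc, &argv, "--no-daemonize");

    /* launch it: fork, exec in the child, remember the pid in the parent */
    srun_pid = fork();
    if (-1 == srun_pid) {
        perror("fork");
        return 1;
    } else if (0 == srun_pid) {
        execvp(argv[0], argv);   /* PATH search; the commit uses opal_path_findv() + execve() */
        perror("execvp");        /* only reached if the exec fails */
        _exit(127);
    }
    printf("launched srun as pid %ld\n", (long)srun_pid);
    waitpid(srun_pid, NULL, 0);  /* reap it so no zombie is left behind */
    return 0;
}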
This commit is contained in:
Jeff Squyres 2005-08-25 16:38:42 +00:00
parent 71a28437bf
commit 524ded4896


@@ -22,9 +22,14 @@
#include "ompi_config.h"
#include <sys/types.h>
#include <unistd.h>
#include <signal.h>
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/runtime/runtime.h"
#include "orte/include/orte_constants.h"
@@ -46,9 +51,12 @@ static int pls_slurm_terminate_job(orte_jobid_t jobid);
static int pls_slurm_terminate_proc(const orte_process_name_t *name);
static int pls_slurm_finalize(void);
static int pls_slurm_start_proc(char *nodename, int argc, char **argv,
char **env);
static int pls_slurm_start_proc(int argc, char **argv, char **env);
/*
* Global variable
*/
orte_pls_base_module_1_0_0_t orte_pls_slurm_module = {
pls_slurm_launch,
pls_slurm_terminate_job,
@@ -56,7 +64,15 @@ orte_pls_base_module_1_0_0_t orte_pls_slurm_module = {
pls_slurm_finalize
};
/*
* Local variable
*/
static pid_t srun_pid = 0;
/*
* External
*/
extern char **environ;
@@ -73,7 +89,10 @@ static int pls_slurm_launch(orte_jobid_t jobid)
char **argv;
int argc;
int rc;
char *tmp;
char** env;
char* var;
/* query the list of nodes allocated to the job - don't need the entire
* mapping - as the daemon/proxy is responsible for determining the apps
* to launch on each node.
@@ -106,9 +125,18 @@ static int pls_slurm_launch(orte_jobid_t jobid)
argv = NULL;
argc = 0;
/* add the srun command */
opal_argv_append(&argc, &argv, "srun");
/* "--mpi=lam" implies the task count equals the node count */
opal_argv_append(&argc, &argv, "--mpi=lam");
asprintf(&tmp, "--nodes=%lu", num_nodes);
opal_argv_append(&argc, &argv, tmp);
free(tmp);
/* add the daemon command (as specified by user) */
opal_argv_append(&argc, &argv, mca_pls_slurm_component.orted);
opal_argv_append(&argc, &argv, "--no-daemonize");
/* check for debug flags */
@@ -118,27 +146,32 @@ static int pls_slurm_launch(orte_jobid_t jobid)
opal_argv_append(&argc, &argv, "--bootproxy");
opal_argv_append(&argc, &argv, jobid_string);
opal_argv_append(&argc, &argv, "--name");
/* JMS: what to do here? */
proc_name_index = argc;
opal_argv_append(&argc, &argv, "");
opal_argv_append(&argc, &argv, "BOGUS");
/* tell the daemon how many procs are in the daemon's job */
/* JMS: what does this do -- is it necessary? i.e., doesn't the
daemon pull this info from the gpr? */
opal_argv_append(&argc, &argv, "--num_procs");
asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes));
opal_argv_append(&argc, &argv, param);
free(param);
/* tell the daemon the starting vpid of the daemon's job */
/* JMS: ditto previous comment */
opal_argv_append(&argc, &argv, "--vpid_start");
opal_argv_append(&argc, &argv, "0");
opal_argv_append(&argc, &argv, "--nodename");
node_name_index = argc;
opal_argv_append(&argc, &argv, "");
/* JMS: what to do here? */
opal_argv_append(&argc, &argv, "BOGUS");
/* pass along the universe name and location info */
opal_argv_append(&argc, &argv, "--universe");
asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
orte_universe_info.host, orte_universe_info.name);
orte_universe_info.host, orte_universe_info.name);
opal_argv_append(&argc, &argv, param);
free(param);
@@ -175,21 +208,13 @@ static int pls_slurm_launch(orte_jobid_t jobid)
}
}
/*
* Iterate through each of the nodes and spin
* up a daemon.
*/
for(item = opal_list_get_first(&nodes);
item != opal_list_get_end(&nodes);
item = opal_list_get_next(item)) {
/* Bookkeeping -- save the node names */
for (item = opal_list_get_first(&nodes);
item != opal_list_get_end(&nodes);
item = opal_list_get_next(item)) {
orte_ras_node_t* node = (orte_ras_node_t*)item;
orte_process_name_t* name;
char* name_string;
char** env;
char* var;
/* setup node name */
argv[node_name_index] = node->node_name;
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
@@ -198,70 +223,61 @@ static int pls_slurm_launch(orte_jobid_t jobid)
goto cleanup;
}
/* setup per-node options */
if (mca_pls_slurm_component.debug) {
opal_output(0, "pls:slurm: launching on node %s\n",
node->node_name);
}
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:slurm: unable to create process name");
exit(-1);
}
argv[proc_name_index] = name_string;
/* setup environment */
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed",NULL,NULL);
opal_setenv(var, "0", true, &env);
/* set the progress engine schedule for this node.
* if node_slots is set to zero, then we default to
* NOT being oversubscribed
*/
if (node->node_slots > 0 &&
node->node_slots_inuse > node->node_slots) {
if (mca_pls_slurm_component.debug) {
opal_output(0, "pls:slurm: oversubscribed -- setting mpi_yield_when_idle to 1");
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "1", true, &env);
} else {
if (mca_pls_slurm_component.debug) {
opal_output(0, "pls:slurm: not oversubscribed -- setting mpi_yield_when_idle to 0");
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "0", true, &env);
}
free(var);
/* save the daemons name on the node */
if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(node,jobid,name))) {
if (ORTE_SUCCESS !=
(rc = orte_pls_base_proxy_set_node_name(node, jobid, name))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* exec the daemon */
if (mca_pls_slurm_component.debug) {
param = opal_argv_join(argv, ' ');
if (NULL != param) {
opal_output(0, "pls:slurm: executing: %s", param);
free(param);
}
}
rc = pls_slurm_start_proc(node->node_name, argc, argv, env);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:slurm: start_procs returned error %d", rc);
goto cleanup;
}
vpid++;
free(name);
}
/* setup environment */
env = opal_argv_copy(environ);
var = mca_base_param_environ_variable("seed", NULL, NULL);
opal_setenv(var, "0", true, &env);
#if 0
/* JMS What to do for sched_yield? */
/* set the progress engine schedule for this node. if node_slots
is set to zero, then we default to NOT being oversubscribed */
if (node->node_slots > 0 &&
node->node_slots_inuse > node->node_slots) {
if (mca_pls_slurm_component.debug) {
opal_output(0, "pls:slurm: oversubscribed -- setting mpi_yield_when_idle to 1");
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "1", true, &env);
} else {
if (mca_pls_slurm_component.debug) {
opal_output(0, "pls:slurm: not oversubscribed -- setting mpi_yield_when_idle to 0");
}
var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
opal_setenv(var, "0", true, &env);
}
free(var);
#endif
/* exec the daemon */
rc = pls_slurm_start_proc(argc, argv, env);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:slurm: start_procs returned error %d", rc);
goto cleanup;
}
/* JMS: should we stash the srun pid in the gpr somewhere for cleanup? */
/* JMS: how do we catch when srun dies? */
cleanup:
while (NULL != (item = opal_list_remove_first(&nodes))) {
OBJ_RELEASE(item);
@@ -273,6 +289,11 @@ cleanup:
static int pls_slurm_terminate_job(orte_jobid_t jobid)
{
if (0 != srun_pid) {
kill(srun_pid, SIGHUP);
/* JMS need appropriate code here to reap */
srun_pid = 0;
}
return orte_pls_base_proxy_terminate_job(jobid);
}
@@ -298,13 +319,24 @@ static int pls_slurm_finalize(void)
}
static int pls_slurm_start_proc(char *nodename, int argc, char **argv,
char **env)
static int pls_slurm_start_proc(int argc, char **argv, char **env)
{
char *a = opal_argv_join(argv, ' ');
int rc;
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
printf("SLURM Starting on node %s: %s\n", nodename, a);
free(a);
if (NULL == exec_argv) {
return ORTE_ERR_NOT_FOUND;
}
srun_pid = fork();
if (-1 == srun_pid) {
printf("Fork failed!\n");
return ORTE_ERR_IN_ERRNO;
} else if (0 == srun_pid) {
rc = execve(exec_argv, argv, env);
printf("execve failed! (%s)\n", argv[0]);
return ORTE_ERR_IN_ERRNO;
}
return ORTE_SUCCESS;
}
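
The two remaining JMS questions in the diff -- how to reap srun after the SIGHUP in pls_slurm_terminate_job(), and how to notice when srun dies on its own -- both come down to standard POSIX child management. The sketch below shows one conventional approach with waitpid() plus a SIGCHLD handler; it is illustrative only and makes no claim about what the component eventually did. The srun_pid global and the terminate_srun()/sigchld_handler() names are stand-ins.

#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>

static pid_t srun_pid = 0;   /* set by the fork in the launch path */

/* terminate path: signal the child, then reap it so no zombie is left */
static void terminate_srun(void)
{
    int status;
    if (0 != srun_pid) {
        kill(srun_pid, SIGHUP);
        waitpid(srun_pid, &status, 0);
        srun_pid = 0;
    }
}

/* "how do we catch when srun dies?" -- a SIGCHLD handler with WNOHANG
 * fires when the child exits unexpectedly */
static void sigchld_handler(int sig)
{
    int status;
    (void)sig;
    if (srun_pid > 0 && waitpid(srun_pid, &status, WNOHANG) == srun_pid) {
        srun_pid = 0;   /* srun is gone; a real component would report an error */
    }
}

int main(void)
{
    signal(SIGCHLD, sigchld_handler);
    /* ... fork/exec srun here, storing the pid in srun_pid ... */
    terminate_srun();
    return 0;
}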