diff --git a/orte/mca/pls/slurm/pls_slurm_module.c b/orte/mca/pls/slurm/pls_slurm_module.c
index 1dd03e71b6..6960f2c1e4 100644
--- a/orte/mca/pls/slurm/pls_slurm_module.c
+++ b/orte/mca/pls/slurm/pls_slurm_module.c
@@ -22,9 +22,14 @@
 
 #include "ompi_config.h"
 
+#include <sys/types.h>
+#include <unistd.h>
+#include <signal.h>
+
 #include "opal/util/argv.h"
 #include "opal/util/output.h"
 #include "opal/util/opal_environ.h"
+#include "opal/util/path.h"
 #include "opal/mca/base/mca_base_param.h"
 #include "orte/runtime/runtime.h"
 #include "orte/include/orte_constants.h"
@@ -46,9 +51,12 @@
 static int pls_slurm_terminate_job(orte_jobid_t jobid);
 static int pls_slurm_terminate_proc(const orte_process_name_t *name);
 static int pls_slurm_finalize(void);
-static int pls_slurm_start_proc(char *nodename, int argc, char **argv,
-                                char **env);
+static int pls_slurm_start_proc(int argc, char **argv, char **env);
+
+/*
+ * Global variable
+ */
 
 orte_pls_base_module_1_0_0_t orte_pls_slurm_module = {
     pls_slurm_launch,
     pls_slurm_terminate_job,
@@ -56,7 +64,15 @@ orte_pls_base_module_1_0_0_t orte_pls_slurm_module = {
     pls_slurm_finalize
 };
 
+/*
+ * Local variable
+ */
+static pid_t srun_pid = 0;
+
+/*
+ * External
+ */
 extern char **environ;
 
@@ -73,7 +89,10 @@ static int pls_slurm_launch(orte_jobid_t jobid)
     char **argv;
     int argc;
     int rc;
-
+    char *tmp;
+    char** env;
+    char* var;
+
     /* query the list of nodes allocated to the job - don't need the entire
      * mapping - as the daemon/proxy is responsible for determining the apps
      * to launch on each node.
      */
@@ -106,9 +125,18 @@
     argv = NULL;
     argc = 0;
 
+    /* add the srun command */
+    opal_argv_append(&argc, &argv, "srun");
+
+    /* "--mpi=lam" implies the task count equals the node count */
+    opal_argv_append(&argc, &argv, "--mpi=lam");
+
+    asprintf(&tmp, "--nodes=%lu", num_nodes);
+    opal_argv_append(&argc, &argv, tmp);
+    free(tmp);
+
     /* add the daemon command (as specified by user) */
     opal_argv_append(&argc, &argv, mca_pls_slurm_component.orted);
-
     opal_argv_append(&argc, &argv, "--no-daemonize");
 
     /* check for debug flags */
@@ -118,27 +146,32 @@
     opal_argv_append(&argc, &argv, "--bootproxy");
     opal_argv_append(&argc, &argv, jobid_string);
     opal_argv_append(&argc, &argv, "--name");
+    /* JMS: what to do here? */
     proc_name_index = argc;
-    opal_argv_append(&argc, &argv, "");
+    opal_argv_append(&argc, &argv, "BOGUS");
 
     /* tell the daemon how many procs are in the daemon's job */
+    /* JMS: what does this do -- is it necessary?  i.e., doesn't the
+       daemon pull this info from the gpr? */
     opal_argv_append(&argc, &argv, "--num_procs");
     asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes));
     opal_argv_append(&argc, &argv, param);
     free(param);
 
     /* tell the daemon the starting vpid of the daemon's job */
+    /* JMS: ditto previous comment */
     opal_argv_append(&argc, &argv, "--vpid_start");
     opal_argv_append(&argc, &argv, "0");
 
     opal_argv_append(&argc, &argv, "--nodename");
     node_name_index = argc;
-    opal_argv_append(&argc, &argv, "");
+    /* JMS: what to do here? */
+    opal_argv_append(&argc, &argv, "BOGUS");
 
     /* pass along the universe name and location info */
     opal_argv_append(&argc, &argv, "--universe");
     asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
-            orte_universe_info.host, orte_universe_info.name);
+             orte_universe_info.host, orte_universe_info.name);
     opal_argv_append(&argc, &argv, param);
     free(param);
 
@@ -175,21 +208,13 @@
         }
     }
 
-    /*
-     * Iterate through each of the nodes and spin
-     * up a daemon.
-     */
-    for(item = opal_list_get_first(&nodes);
-        item != opal_list_get_end(&nodes);
-        item = opal_list_get_next(item)) {
+    /* Bookkeeping -- save the node names */
+    for (item = opal_list_get_first(&nodes);
+         item != opal_list_get_end(&nodes);
+         item = opal_list_get_next(item)) {
         orte_ras_node_t* node = (orte_ras_node_t*)item;
         orte_process_name_t* name;
         char* name_string;
-        char** env;
-        char* var;
-
-        /* setup node name */
-        argv[node_name_index] = node->node_name;
 
         /* initialize daemons process name */
         rc = orte_ns.create_process_name(&name, node->node_cellid, 0, vpid);
@@ -198,70 +223,61 @@ static int pls_slurm_launch(orte_jobid_t jobid)
             goto cleanup;
         }
 
-        /* setup per-node options */
-        if (mca_pls_slurm_component.debug) {
-            opal_output(0, "pls:slurm: launching on node %s\n",
-                        node->node_name);
-        }
-
         /* setup process name */
         rc = orte_ns.get_proc_name_string(&name_string, name);
         if (ORTE_SUCCESS != rc) {
             opal_output(0, "pls:slurm: unable to create process name");
             exit(-1);
         }
-        argv[proc_name_index] = name_string;
-
-        /* setup environment */
-        env = opal_argv_copy(environ);
-        var = mca_base_param_environ_variable("seed",NULL,NULL);
-        opal_setenv(var, "0", true, &env);
-
-        /* set the progress engine schedule for this node.
-         * if node_slots is set to zero, then we default to
-         * NOT being oversubscribed
-         */
-        if (node->node_slots > 0 &&
-            node->node_slots_inuse > node->node_slots) {
-            if (mca_pls_slurm_component.debug) {
-                opal_output(0, "pls:slurm: oversubscribed -- setting mpi_yield_when_idle to 1");
-            }
-            var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
-            opal_setenv(var, "1", true, &env);
-        } else {
-            if (mca_pls_slurm_component.debug) {
-                opal_output(0, "pls:slurm: not oversubscribed -- setting mpi_yield_when_idle to 0");
-            }
-            var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
-            opal_setenv(var, "0", true, &env);
-        }
-        free(var);
 
         /* save the daemons name on the node */
-        if (ORTE_SUCCESS != (rc = orte_pls_base_proxy_set_node_name(node,jobid,name))) {
+        if (ORTE_SUCCESS !=
+            (rc = orte_pls_base_proxy_set_node_name(node, jobid, name))) {
             ORTE_ERROR_LOG(rc);
             goto cleanup;
         }
-
-        /* exec the daemon */
-        if (mca_pls_slurm_component.debug) {
-            param = opal_argv_join(argv, ' ');
-            if (NULL != param) {
-                opal_output(0, "pls:slurm: executing: %s", param);
-                free(param);
-            }
-        }
-
-        rc = pls_slurm_start_proc(node->node_name, argc, argv, env);
-        if (ORTE_SUCCESS != rc) {
-            opal_output(0, "pls:slurm: start_procs returned error %d", rc);
-            goto cleanup;
-        }
 
         vpid++;
         free(name);
     }
+
+    /* setup environment */
+    env = opal_argv_copy(environ);
+    var = mca_base_param_environ_variable("seed", NULL, NULL);
+    opal_setenv(var, "0", true, &env);
+#if 0
+    /* JMS What to do for sched_yield? */
+
+    /* set the progress engine schedule for this node.  if node_slots
+       is set to zero, then we default to NOT being oversubscribed */
+    if (node->node_slots > 0 &&
+        node->node_slots_inuse > node->node_slots) {
+        if (mca_pls_slurm_component.debug) {
+            opal_output(0, "pls:slurm: oversubscribed -- setting mpi_yield_when_idle to 1");
+        }
+        var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
+        opal_setenv(var, "1", true, &env);
+    } else {
+        if (mca_pls_slurm_component.debug) {
+            opal_output(0, "pls:slurm: not oversubscribed -- setting mpi_yield_when_idle to 0");
+        }
+        var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle");
+        opal_setenv(var, "0", true, &env);
+    }
+    free(var);
+#endif
+
+    /* exec the daemon */
+    rc = pls_slurm_start_proc(argc, argv, env);
+    if (ORTE_SUCCESS != rc) {
+        opal_output(0, "pls:slurm: start_procs returned error %d", rc);
+        goto cleanup;
+    }
+
+    /* JMS: should we stash the srun pid in the gpr somewhere for cleanup? */
+    /* JMS: how do we catch when srun dies? */
+
 cleanup:
     while (NULL != (item = opal_list_remove_first(&nodes))) {
         OBJ_RELEASE(item);
@@ -273,6 +289,11 @@
 
 static int pls_slurm_terminate_job(orte_jobid_t jobid)
 {
+    if (0 != srun_pid) {
+        kill(srun_pid, SIGHUP);
+        /* JMS need appropriate code here to reap */
+        srun_pid = 0;
+    }
     return orte_pls_base_proxy_terminate_job(jobid);
 }
 
@@ -298,13 +319,24 @@ static int pls_slurm_finalize(void)
 }
 
 
-static int pls_slurm_start_proc(char *nodename, int argc, char **argv,
-                                char **env)
+static int pls_slurm_start_proc(int argc, char **argv, char **env)
 {
-    char *a = opal_argv_join(argv, ' ');
+    int rc;
+    char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
 
-    printf("SLURM Starting on node %s: %s\n", nodename, a);
-    free(a);
+    if (NULL == exec_argv) {
+        return ORTE_ERR_NOT_FOUND;
+    }
+
+    srun_pid = fork();
+    if (-1 == srun_pid) {
+        printf("Fork failed!\n");
+        return ORTE_ERR_IN_ERRNO;
+    } else if (0 == srun_pid) {
+        rc = execve(exec_argv, argv, env);
+        printf("execve failed! (%s)\n", argv[0]);
+        return ORTE_ERR_IN_ERRNO;
+    }
 
     return ORTE_SUCCESS;
 }
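
The new pls_slurm_terminate_job() sends SIGHUP to srun but, per the "/* JMS need appropriate code here to reap */" note, never reaps it, and the child branch of pls_slurm_start_proc() returns into the parent's code path if execve() fails. Below is a minimal standalone POSIX sketch of the fork/exec, kill, and waitpid() pattern involved; it is not ORTE code, and the helper names (start_child, terminate_child) and the sleep command standing in for srun are illustrative only.

/*
 * Standalone POSIX sketch (not ORTE code) of the kill-then-reap pattern
 * that the "JMS need appropriate code here to reap" note asks for.
 * "sleep" is just a stand-in for srun.
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static pid_t child_pid = 0;

/* fork/exec a child, remembering its pid (mirrors pls_slurm_start_proc) */
static int start_child(char *const child_argv[])
{
    child_pid = fork();
    if (-1 == child_pid) {
        return -1;
    } else if (0 == child_pid) {
        execvp(child_argv[0], child_argv);
        /* exec failed: _exit() so the child never falls back into the
           parent's code path */
        _exit(127);
    }
    return 0;
}

/* signal the child and reap it so it does not linger as a zombie
   (what pls_slurm_terminate_job could do after kill()) */
static int terminate_child(void)
{
    int status;

    if (0 != child_pid) {
        kill(child_pid, SIGHUP);
        if (waitpid(child_pid, &status, 0) == child_pid) {
            printf("child exited, status %d\n", status);
        }
        child_pid = 0;
    }
    return 0;
}

int main(void)
{
    char *const child_argv[] = { "sleep", "60", NULL };

    if (0 == start_child(child_argv)) {
        terminate_child();
    }
    return 0;
}

Whether the blocking waitpid() belongs in terminate_job() or elsewhere in the module is exactly the open question the JMS notes leave for later.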
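
The other open question in the patch is "/* JMS: how do we catch when srun dies? */". One conventional POSIX answer is to reap asynchronously from a SIGCHLD handler. The sketch below is a standalone illustration under that assumption, not ORTE code; monitored_pid merely stands in for the module's srun_pid.

/*
 * Standalone sketch (not ORTE code) of noticing an srun-like child dying
 * asynchronously by reaping it from a SIGCHLD handler.
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile pid_t monitored_pid = 0;
static volatile sig_atomic_t child_died = 0;

static void sigchld_handler(int sig)
{
    int status;
    pid_t pid;

    (void) sig;
    /* reap every child that has exited; WNOHANG keeps this non-blocking */
    while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
        if (pid == monitored_pid) {
            child_died = 1;   /* real code would trigger job cleanup here */
        }
    }
}

int main(void)
{
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = sigchld_handler;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = SA_RESTART;
    sigaction(SIGCHLD, &sa, NULL);

    monitored_pid = fork();
    if (0 == monitored_pid) {
        execlp("sleep", "sleep", "1", (char *) NULL);
        _exit(127);
    }

    /* stand-in for the module's progress loop; a real implementation
       would use sigsuspend() to avoid the check/pause race */
    while (!child_died) {
        pause();
    }
    printf("child %ld exited\n", (long) monitored_pid);
    return 0;
}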