Get srun to play nice. The problem was that everything worked fine as long as the user ran "salloc" with an argument requesting a specific number of nodes. However, if the user instead specified a number of processes, we launched that number of daemons, resulting in multiple daemons per node. Not good.
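So force things to behave correctly either way: the "--ntasks-per-node=1" argument is dropped, and "--ntasks=<num_new_daemons>" is now appended after the nodelist block instead of inside it, so srun is always told how many daemons to run. This commit was SVN r25792.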
Этот коммит содержится в:
родитель
ef94e606c7
Коммит
07f3a91075
@ -257,9 +257,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
|||||||
/* add the srun command */
|
/* add the srun command */
|
||||||
opal_argv_append(&argc, &argv, "srun");
|
opal_argv_append(&argc, &argv, "srun");
|
||||||
|
|
||||||
/* ensure we only launch one daemon/node */
|
|
||||||
opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
|
|
||||||
|
|
||||||
/* alert us if any orteds die during startup */
|
/* alert us if any orteds die during startup */
|
||||||
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
|
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
|
||||||
|
|
||||||
@ -308,15 +305,16 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
|||||||
opal_argv_append(&argc, &argv, tmp);
|
opal_argv_append(&argc, &argv, tmp);
|
||||||
free(tmp);
|
free(tmp);
|
||||||
|
|
||||||
asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
|
|
||||||
opal_argv_append(&argc, &argv, tmp);
|
|
||||||
free(tmp);
|
|
||||||
|
|
||||||
asprintf(&tmp, "--nodelist=%s", nodelist_flat);
|
asprintf(&tmp, "--nodelist=%s", nodelist_flat);
|
||||||
opal_argv_append(&argc, &argv, tmp);
|
opal_argv_append(&argc, &argv, tmp);
|
||||||
free(tmp);
|
free(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* tell srun how many tasks to run */
|
||||||
|
asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
|
||||||
|
opal_argv_append(&argc, &argv, tmp);
|
||||||
|
free(tmp);
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
|
||||||
"%s plm:slurm: launching on nodes %s",
|
"%s plm:slurm: launching on nodes %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user