Get srun to play nice. The problem was that everything worked fine as long as the user ran "salloc" with an argument requesting a specific number of nodes. However, if the user instead specified a number of processes, we launched that number of daemons, resulting in multiple daemons per node. Not good.
So force things to behave correctly either way. This commit was SVN r25792.
Этот коммит содержится в:
родитель
ef94e606c7
Коммит
07f3a91075
@@ -257,9 +257,6 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
/* add the srun command */
|
||||
opal_argv_append(&argc, &argv, "srun");
|
||||
|
||||
/* ensure we only launch one daemon/node */
|
||||
opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
|
||||
|
||||
/* alert us if any orteds die during startup */
|
||||
opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
|
||||
|
||||
@@ -308,15 +305,16 @@ static int plm_slurm_launch_job(orte_job_t *jdata)
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
asprintf(&tmp, "--nodelist=%s", nodelist_flat);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
/* tell srun how many tasks to run */
|
||||
asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
|
||||
opal_argv_append(&argc, &argv, tmp);
|
||||
free(tmp);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
|
||||
"%s plm:slurm: launching on nodes %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user