Once again, the Slurm folks have decided to redefine their envars, reversing what they had previously told us to do. So clean up the Slurm allocation code, and also adjust to a change in srun behavior: srun now aborts the job if ORTE doesn't specify --ntasks-per-node when calling it but the user did specify it when getting the allocation. Sigh.

cmr=v1.7.4:reviewer=miked:subject=Update Slurm allocation and launch

This commit was SVN r29849.
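A minimal sketch of the launch-side conflict described above, assuming the standard SLURM_NTASKS_PER_NODE variable that salloc/sbatch export when the user requests --ntasks-per-node (the variable name and the inheritance behavior are background assumptions for illustration, not part of this commit):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative only: "salloc --ntasks-per-node=N" exports
 * SLURM_NTASKS_PER_NODE into the allocation's environment, and a later
 * srun can pick it up. ORTE wants exactly one orted per node, so it now
 * overrides the inherited value explicitly on the srun command line. */
int main(void)
{
    const char *ntasks = getenv("SLURM_NTASKS_PER_NODE");
    if (NULL != ntasks) {
        printf("allocation requested %s tasks/node; "
               "pass --ntasks-per-node=1 when launching orteds\n", ntasks);
    }
    return 0;
}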
This commit is contained in:
Ralph Castain 2013-12-09 17:58:46 +00:00
parent e45412f5db
commit 83e59e6761
2 changed files with 14 additions and 13 deletions

View file

@@ -263,6 +263,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
     /* add the srun command */
     opal_argv_append(&argc, &argv, "srun");
+    /* start one orted on each node */
+    opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
     /* alert us if any orteds die during startup */
     opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
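The hunk above builds the srun command line one argument at a time with opal_argv_append(). A self-contained sketch of that pattern; argv_append() here is a simplified stand-in for the real OPAL utility, and the argument sequence mirrors the lines above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for opal_argv_append(): grow a NULL-terminated
 * argv array by one entry. */
static void argv_append(int *argc, char ***argv, const char *arg)
{
    char **tmp = realloc(*argv, (*argc + 2) * sizeof(char *));
    if (NULL == tmp) {
        exit(1);
    }
    tmp[*argc] = strdup(arg);
    tmp[*argc + 1] = NULL;
    *argv = tmp;
    (*argc)++;
}

int main(void)
{
    int argc = 0;
    char **argv = NULL;

    /* mirror the launch_daemons() sequence from the hunk above */
    argv_append(&argc, &argv, "srun");
    argv_append(&argc, &argv, "--ntasks-per-node=1"); /* one orted per node */
    argv_append(&argc, &argv, "--kill-on-bad-exit");  /* fail fast if an orted dies */

    for (int i = 0; i < argc; i++) {
        printf("%s%c", argv[i], i + 1 < argc ? ' ' : '\n');
    }
    return 0;
}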

View file

@@ -12,6 +12,7 @@
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2013      Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -268,9 +269,7 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
     }
     regexp = strdup(slurm_node_str);
-    tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
-    if (NULL == tasks_per_node) {
-        /* try an older variation */
+    /* get the number of process slots we were assigned on each node */
     tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
     if (NULL == tasks_per_node) {
         /* couldn't find any version - abort */
@@ -278,7 +277,6 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
                        "SLURM_TASKS_PER_NODE");
         return ORTE_ERR_NOT_FOUND;
     }
-    }
     node_tasks = strdup(tasks_per_node);
     if(NULL == regexp || NULL == node_tasks) {
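For reference, SLURM_TASKS_PER_NODE reports per-node slot counts in Slurm's compressed notation, e.g. "2(x3),1" meaning 2 slots on each of the first three nodes and 1 on the next. A standalone sketch of expanding that notation; print_slots() is a hypothetical helper, not the parser orte_ras_slurm_allocate() actually uses:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: expand Slurm's compressed per-node slot list,
 * where each comma-separated entry is "N" or "N(xM)" (N slots repeated
 * for M consecutive nodes). */
static void print_slots(const char *tasks_per_node)
{
    const char *p = tasks_per_node;
    int node = 0;
    while (*p) {
        char *end;
        long slots = strtol(p, &end, 10);
        long repeat = 1;
        if ('(' == *end) {                       /* "N(xM)" repetition */
            repeat = strtol(end + 2, &end, 10);  /* skip "(x" */
            end++;                               /* skip ')' */
        }
        for (long r = 0; r < repeat; r++) {
            printf("node %d: %ld slots\n", node++, slots);
        }
        p = (',' == *end) ? end + 1 : end;
    }
}

int main(void)
{
    const char *tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
    if (NULL == tasks_per_node) {
        /* mirror the abort path in the hunk above */
        fprintf(stderr, "SLURM_TASKS_PER_NODE not found in environment\n");
        return 1;
    }
    print_slots(tasks_per_node);
    return 0;
}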