Once again, the Slurm folks have decided to redefine their envars, reversing what they had previously told us to do. So clean up the Slurm allocation code, and also adjust to a change in srun behavior: srun now aborts a job if ntasks-per-node is not specified when ORTE calls it but the user did specify it when getting the allocation. Sigh.

cmr=v1.7.4:reviewer=miked:subject=Update Slurm allocation and launch

This commit was SVN r29849.
This commit is contained in:
Ralph Castain 2013-12-09 17:58:46 +00:00
parent e45412f5db
commit 83e59e6761
2 changed files with 14 additions and 13 deletions
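
For context (not part of this commit): SLURM_TASKS_PER_NODE reports the slot count per node in Slurm's compressed form, e.g. "2(x3),1" for two tasks on each of three nodes followed by one task on a fourth node. Below is a minimal standalone sketch of expanding that form; the helper name expand_tasks_per_node and the sample value are illustrative only, assuming the format documented for the variable, and this is not the parsing code ORTE itself uses.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Expand a compressed Slurm tasks-per-node string such as "2(x3),1"
 * into one slot count per node: {2, 2, 2, 1}.  Hypothetical helper,
 * for illustration only. */
static int expand_tasks_per_node(const char *spec, int *slots, int max_nodes)
{
    char *copy = strdup(spec);
    char *saveptr = NULL;
    int n = 0;

    for (char *tok = strtok_r(copy, ",", &saveptr); NULL != tok;
         tok = strtok_r(NULL, ",", &saveptr)) {
        int count = atoi(tok);          /* tasks per node for this entry */
        int repeat = 1;
        char *rep = strchr(tok, '(');   /* optional "(xN)" repeat factor */
        if (NULL != rep) {
            repeat = atoi(rep + 2);     /* skip "(x" */
        }
        for (int i = 0; i < repeat && n < max_nodes; i++) {
            slots[n++] = count;
        }
    }
    free(copy);
    return n;                           /* number of node entries produced */
}

int main(void)
{
    const char *spec = getenv("SLURM_TASKS_PER_NODE");
    int slots[256];

    if (NULL == spec) {
        spec = "2(x3),1";               /* sample value for illustration */
    }
    int nnodes = expand_tasks_per_node(spec, slots, 256);
    for (int i = 0; i < nnodes; i++) {
        printf("node %d: %d slot(s)\n", i, slots[i]);
    }
    return 0;
}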

View file

@@ -263,6 +263,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
     /* add the srun command */
     opal_argv_append(&argc, &argv, "srun");
+    /* start one orted on each node */
+    opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
     /* alert us if any orteds die during startup */
     opal_argv_append(&argc, &argv, "--kill-on-bad-exit");

View file

@@ -11,7 +11,8 @@
  * All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -253,9 +254,9 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
     /* see if dynamic allocation is enabled */
     if (mca_ras_slurm_component.dyn_alloc_enabled) {
         /* attempt to get the allocation - the function
          * dyn_allocate will return as ORTE_ERR_ALLOCATION_PENDING
          * if it succeeds in sending the allocation request
          */
         ret = dyn_allocate(jdata);
         /* return to the above layer in ras/base/ras_base_allocate.c
          * to wait for event (libevent) happening
@@ -268,16 +269,13 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
     }
     regexp = strdup(slurm_node_str);
-    tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
+    /* get the number of process slots we were assigned on each node */
+    tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
     if (NULL == tasks_per_node) {
-        /* try an older variation */
-        tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
-        if (NULL == tasks_per_node) {
-            /* couldn't find any version - abort */
-            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
-                           "SLURM_TASKS_PER_NODE");
-            return ORTE_ERR_NOT_FOUND;
-        }
+        /* couldn't find any version - abort */
+        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
+                       "SLURM_TASKS_PER_NODE");
+        return ORTE_ERR_NOT_FOUND;
     }
     node_tasks = strdup(tasks_per_node);