Once again, the Slurm folks have decided to redefine their envars, reversing what they had previously told us to do. So clean up the Slurm allocation code, and also adjust to a change in srun behavior: srun now aborts the job if ntasks-per-node isn't specified when ORTE calls it, even though the user specified it when getting the allocation. Sigh.
cmr=v1.7.4:reviewer=miked:subject=Update Slurm allocation and launch

This commit was SVN r29849.
This commit is contained in:
parent e45412f5db
commit 83e59e6761
--- a/orte/mca/plm/slurm/plm_slurm_module.c
+++ b/orte/mca/plm/slurm/plm_slurm_module.c
@@ -263,6 +263,9 @@ static void launch_daemons(int fd, short args, void *cbdata)
     /* add the srun command */
     opal_argv_append(&argc, &argv, "srun");
 
+    /* start one orted on each node */
+    opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
+
     /* alert us if any orteds die during startup */
     opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
 
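For context, the daemon launch line this hunk builds can be sketched standalone. This is a minimal sketch using plain argv handling rather than OPAL's opal_argv_append, with "orted" shown only as an illustrative target; the assumption (taken from the commit message) is that srun otherwise inherits the allocation's per-node task count and aborts the daemon launch:

#include <stdio.h>

int main(void)
{
    /* Sketch of the launch line built above. Forcing --ntasks-per-node=1
     * overrides whatever per-node task count the user requested when
     * getting the allocation, so srun no longer aborts the launch. */
    const char *argv[] = {
        "srun",
        "--ntasks-per-node=1",  /* exactly one orted per allocated node */
        "--kill-on-bad-exit",   /* kill the step if any orted dies during
                                 * startup */
        "orted",                /* illustrative daemon target */
        NULL
    };

    for (int i = 0; NULL != argv[i]; i++) {
        printf("%s%s", argv[i], NULL == argv[i + 1] ? "\n" : " ");
    }
    return 0;
}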
--- a/orte/mca/ras/slurm/ras_slurm_module.c
+++ b/orte/mca/ras/slurm/ras_slurm_module.c
@@ -11,7 +11,8 @@
  * All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -253,9 +254,9 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
     /* see if dynamic allocation is enabled */
     if (mca_ras_slurm_component.dyn_alloc_enabled) {
         /* attempt to get the allocation - the function
          * dyn_allocate will return as ORTE_ERR_ALLOCATION_PENDING
          * if it succeeds in sending the allocation request
          */
         ret = dyn_allocate(jdata);
         /* return to the above layer in ras/base/ras_base_allocate.c
          * to wait for event (libevent) happening
@@ -268,16 +269,13 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
     }
     regexp = strdup(slurm_node_str);
 
-    tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
+    /* get the number of process slots we were assigned on each node */
+    tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
     if (NULL == tasks_per_node) {
-        /* try an older variation */
-        tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
-        if (NULL == tasks_per_node) {
-            /* couldn't find any version - abort */
-            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
-                           "SLURM_TASKS_PER_NODE");
-            return ORTE_ERR_NOT_FOUND;
-        }
+        /* couldn't find any version - abort */
+        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
+                       "SLURM_TASKS_PER_NODE");
+        return ORTE_ERR_NOT_FOUND;
     }
     node_tasks = strdup(tasks_per_node);
 
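For reference, Slurm reports SLURM_TASKS_PER_NODE in a run-length form such as "2(x3),1", meaning two tasks on each of three consecutive nodes, then one task on a fourth. Below is a minimal standalone sketch of expanding that value; it is not ORTE's actual parser, just an illustration of the string node_tasks now holds:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    const char *val = getenv("SLURM_TASKS_PER_NODE");
    char *copy, *item, *saveptr;

    if (NULL == val) {
        /* same failure mode the allocator now reports via orte_show_help */
        fprintf(stderr, "SLURM_TASKS_PER_NODE not found - not in an allocation?\n");
        return 1;
    }
    copy = strdup(val);
    for (item = strtok_r(copy, ",", &saveptr); NULL != item;
         item = strtok_r(NULL, ",", &saveptr)) {
        int tasks = 0, repeats = 1;
        /* "2(x3)" means 2 tasks on each of 3 consecutive nodes */
        if (2 != sscanf(item, "%d(x%d)", &tasks, &repeats)) {
            repeats = 1;
            if (1 != sscanf(item, "%d", &tasks)) {
                continue;  /* unrecognized chunk - skip it in this sketch */
            }
        }
        for (int i = 0; i < repeats; i++) {
            printf("slots on next node: %d\n", tasks);
        }
    }
    free(copy);
    return 0;
}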