Merge pull request #481 from rhc54/topic/slurm
Add new MCA parameter to support edge case with debugger at LLNL
Commit b41d2ad6c4
@@ -11,6 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
  * All rights reserved.
+ * Copyright (c) 2015 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -37,6 +38,7 @@ typedef struct {
     bool dyn_alloc_enabled;
     char *config_file;
     bool rolling_alloc;
+    bool use_all;
 } orte_ras_slurm_component_t;

 ORTE_DECLSPEC extern orte_ras_slurm_component_t mca_ras_slurm_component;
@@ -106,6 +106,14 @@ static int ras_slurm_register(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_ras_slurm_component.rolling_alloc);

+    mca_ras_slurm_component.use_all = false;
+    (void) mca_base_component_var_register (component, "use_entire_allocation",
+                                            "Use entire allocation (not just job step nodes) for this application",
+                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                            OPAL_INFO_LVL_9,
+                                            MCA_BASE_VAR_SCOPE_READONLY,
+                                            &mca_ras_slurm_component.use_all);
+
     return ORTE_SUCCESS;
 }

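For reference: registered this way, the variable surfaces under the usual MCA naming convention as ras_slurm_use_entire_allocation, so it can be set with, e.g., `mpirun -mca ras_slurm_use_entire_allocation 1` or via the OMPI_MCA_ras_slurm_use_entire_allocation environment variable. The standalone sketch below is not OMPI code (pick_slots_source is a hypothetical helper); it only illustrates the decision this flag drives in the allocator, namely which SLURM variable supplies the per-node slot counts:

/* Minimal standalone sketch, assuming the standard OMPI_MCA_* environment
 * mapping; pick_slots_source is a hypothetical helper, not part of OMPI. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static const char *pick_slots_source(bool use_all)
{
    /* use_all: the tool/debugger case - count slots for the whole
     * allocation.  Otherwise: count only the job-step assignment. */
    return use_all ? "SLURM_JOB_CPUS_PER_NODE" : "SLURM_TASKS_PER_NODE";
}

int main(void)
{
    const char *flag = getenv("OMPI_MCA_ras_slurm_use_entire_allocation");
    bool use_all = (NULL != flag && '0' != flag[0]);
    const char *var = pick_slots_source(use_all);
    const char *val = getenv(var);
    printf("%s = %s\n", var, NULL == val ? "(not set)" : val);
    return 0;
}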
@@ -280,39 +280,63 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
         return ORTE_ERR_OUT_OF_RESOURCE;
     }

-    /* get the number of process slots we were assigned on each node */
-    tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
-    if (NULL == tasks_per_node) {
-        /* couldn't find any version - abort */
-        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
-                       "SLURM_TASKS_PER_NODE");
-        free(regexp);
-        return ORTE_ERR_NOT_FOUND;
-    }
-    node_tasks = strdup(tasks_per_node);
-    if(NULL == node_tasks) {
-        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
-        free(regexp);
-        return ORTE_ERR_OUT_OF_RESOURCE;
-    }
-
-    /* get the number of CPUs per task that the user provided to slurm */
-    tmp = getenv("SLURM_CPUS_PER_TASK");
-    if(NULL != tmp) {
-        cpus_per_task = atoi(tmp);
-        if(0 >= cpus_per_task) {
-            opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
-                        "Variable was: %s\n", tmp);
-            ORTE_ERROR_LOG(ORTE_ERROR);
-            free(node_tasks);
-            free(regexp);
-            return ORTE_ERROR;
-        }
-    } else {
-        cpus_per_task = 1;
-    }
+    if (mca_ras_slurm_component.use_all) {
+        /* this is an oddball case required for debug situations where
+         * a tool is started that will then call mpirun. In this case,
+         * Slurm will assign only 1 tasks/per node to the tool, but
+         * we want mpirun to use the entire allocation. They don't give
+         * us a specific variable for this purpose, so we have to fudge
+         * a bit - but this is a special edge case, and we'll live with it */
+        tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
+        if (NULL == tasks_per_node) {
+            /* couldn't find any version - abort */
+            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
+                           "SLURM_JOB_CPUS_PER_NODE");
+            free(regexp);
+            return ORTE_ERR_NOT_FOUND;
+        }
+        node_tasks = strdup(tasks_per_node);
+        if (NULL == node_tasks) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            free(regexp);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+        cpus_per_task = 1;
+    } else {
+        /* get the number of process slots we were assigned on each node */
+        tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
+        if (NULL == tasks_per_node) {
+            /* couldn't find any version - abort */
+            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
+                           "SLURM_TASKS_PER_NODE");
+            free(regexp);
+            return ORTE_ERR_NOT_FOUND;
+        }
+        node_tasks = strdup(tasks_per_node);
+        if (NULL == node_tasks) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            free(regexp);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+
+        /* get the number of CPUs per task that the user provided to slurm */
+        tmp = getenv("SLURM_CPUS_PER_TASK");
+        if(NULL != tmp) {
+            cpus_per_task = atoi(tmp);
+            if(0 >= cpus_per_task) {
+                opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
+                            "Variable was: %s\n", tmp);
+                ORTE_ERROR_LOG(ORTE_ERROR);
+                free(node_tasks);
+                free(regexp);
+                return ORTE_ERROR;
+            }
+        } else {
+            cpus_per_task = 1;
+        }
+    }

     ret = orte_ras_slurm_discover(regexp, node_tasks, nodes);
     free(regexp);
     free(node_tasks);
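Both branches end up handing orte_ras_slurm_discover() a slot string in SLURM's compressed per-node syntax, e.g. "2(x3),1" meaning three nodes with 2 slots each followed by one node with 1 slot. The standalone sketch below is only an illustration of how such a string expands (the real parsing lives in orte_ras_slurm_discover and is not reproduced here):

/* Standalone sketch of expanding SLURM's compressed tasks-per-node
 * syntax ("2(x3),1" -> 2,2,2,1).  Illustrative only; not OMPI code. */
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void expand_tasks_per_node(const char *spec)
{
    char *copy = strdup(spec);
    if (NULL == copy) {
        return;
    }
    char *saveptr = NULL;
    for (char *tok = strtok_r(copy, ",", &saveptr); NULL != tok;
         tok = strtok_r(NULL, ",", &saveptr)) {
        int slots = atoi(tok);          /* leading count, e.g. "2"   */
        int repeat = 1;
        char *rep = strchr(tok, '(');   /* optional "(xN)" repeat    */
        if (NULL != rep) {
            repeat = atoi(rep + 2);     /* skip past "(x"            */
        }
        for (int i = 0; i < repeat; i++) {
            printf("node gets %d slot(s)\n", slots);
        }
    }
    free(copy);
}

int main(void)
{
    expand_tasks_per_node("2(x3),1");   /* prints 2, 2, 2, then 1 */
    return 0;
}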