Merge pull request #481 from rhc54/topic/slurm
Add new MCA parameter to support edge case with debugger at LLNL
Commit b41d2ad6c4
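The MCA variable added below is registered against the ras framework's slurm component, so with Open MPI's usual framework_component_variable naming it should surface as ras_slurm_use_entire_allocation. A hedged sketch of how the debugger wrapper (or a user) might enable it; the application name is hypothetical:

    mpirun --mca ras_slurm_use_entire_allocation 1 ./my_app
    # or, equivalently, through the environment:
    export OMPI_MCA_ras_slurm_use_entire_allocation=1

When the flag is left at its default (false), the allocator keeps the existing SLURM_TASKS_PER_NODE behavior shown in the last hunk.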
@@ -11,6 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2012-2013 Los Alamos National Security, LLC.
  * All rights reserved.
+ * Copyright (c) 2015 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -37,6 +38,7 @@ typedef struct {
     bool dyn_alloc_enabled;
     char *config_file;
     bool rolling_alloc;
+    bool use_all;
 } orte_ras_slurm_component_t;
 
 ORTE_DECLSPEC extern orte_ras_slurm_component_t mca_ras_slurm_component;
@@ -106,6 +106,14 @@ static int ras_slurm_register(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_ras_slurm_component.rolling_alloc);
 
+    mca_ras_slurm_component.use_all = false;
+    (void) mca_base_component_var_register (component, "use_entire_allocation",
+                                            "Use entire allocation (not just job step nodes) for this application",
+                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                            OPAL_INFO_LVL_9,
+                                            MCA_BASE_VAR_SCOPE_READONLY,
+                                            &mca_ras_slurm_component.use_all);
+
     return ORTE_SUCCESS;
 }
 
@@ -280,39 +280,63 @@ static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
         return ORTE_ERR_OUT_OF_RESOURCE;
     }
 
-    /* get the number of process slots we were assigned on each node */
-    tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
-    if (NULL == tasks_per_node) {
-        /* couldn't find any version - abort */
-        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
-                       "SLURM_TASKS_PER_NODE");
-        free(regexp);
-        return ORTE_ERR_NOT_FOUND;
-    }
-    node_tasks = strdup(tasks_per_node);
-    if(NULL == node_tasks) {
-        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
-        free(regexp);
-        return ORTE_ERR_OUT_OF_RESOURCE;
-    }
-
-    /* get the number of CPUs per task that the user provided to slurm */
-    tmp = getenv("SLURM_CPUS_PER_TASK");
-    if(NULL != tmp) {
-        cpus_per_task = atoi(tmp);
-        if(0 >= cpus_per_task) {
-            opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
-                        "Variable was: %s\n", tmp);
-            ORTE_ERROR_LOG(ORTE_ERROR);
-            free(node_tasks);
-            free(regexp);
-            return ORTE_ERROR;
-        }
-    } else {
-        cpus_per_task = 1;
-    }
+    if (mca_ras_slurm_component.use_all) {
+        /* this is an oddball case required for debug situations where
+         * a tool is started that will then call mpirun. In this case,
+         * Slurm will assign only 1 tasks/per node to the tool, but
+         * we want mpirun to use the entire allocation. They don't give
+         * us a specific variable for this purpose, so we have to fudge
+         * a bit - but this is a special edge case, and we'll live with it */
+        tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
+        if (NULL == tasks_per_node) {
+            /* couldn't find any version - abort */
+            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
+                           "SLURM_JOB_CPUS_PER_NODE");
+            free(regexp);
+            return ORTE_ERR_NOT_FOUND;
+        }
+        node_tasks = strdup(tasks_per_node);
+        if (NULL == node_tasks) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            free(regexp);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+    } else {
+        /* get the number of process slots we were assigned on each node */
+        tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
+        if (NULL == tasks_per_node) {
+            /* couldn't find any version - abort */
+            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
+                           "SLURM_TASKS_PER_NODE");
+            free(regexp);
+            return ORTE_ERR_NOT_FOUND;
+        }
+        node_tasks = strdup(tasks_per_node);
+        if (NULL == node_tasks) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            free(regexp);
+            return ORTE_ERR_OUT_OF_RESOURCE;
+        }
+
+        /* get the number of CPUs per task that the user provided to slurm */
+        tmp = getenv("SLURM_CPUS_PER_TASK");
+        if(NULL != tmp) {
+            cpus_per_task = atoi(tmp);
+            if(0 >= cpus_per_task) {
+                opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
+                            "Variable was: %s\n", tmp);
+                ORTE_ERROR_LOG(ORTE_ERROR);
+                free(node_tasks);
+                free(regexp);
+                return ORTE_ERROR;
+            }
+        } else {
+            cpus_per_task = 1;
+        }
+    }
 
     ret = orte_ras_slurm_discover(regexp, node_tasks, nodes);
     free(regexp);
     free(node_tasks);
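For context on why the two environment variables can be swapped in the use_all branch: both SLURM_TASKS_PER_NODE and SLURM_JOB_CPUS_PER_NODE report one count per node in Slurm's compressed "count(xreps)" syntax (e.g. "2(x3),1" for 2, 2, 2, 1), so either string can be handed to the same discovery routine. The following is a minimal stand-alone sketch of expanding that syntax; it is an illustration only, not the project's orte_ras_slurm_discover() parser, and the sample value is hypothetical.

/*
 * Sketch: expand Slurm's compressed per-node count string.
 * Assumes the "N(xR)" form, e.g. "2(x3),1" -> 2 2 2 1.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void print_expanded(const char *compressed)
{
    char *copy = strdup(compressed);        /* strtok modifies its input */
    for (char *tok = strtok(copy, ","); NULL != tok; tok = strtok(NULL, ",")) {
        int count = atoi(tok);              /* slots per node in this group */
        int reps = 1;                       /* number of nodes sharing that count */
        char *paren = strchr(tok, '(');
        if (NULL != paren) {
            reps = atoi(paren + 2);         /* skip the "(x" */
        }
        for (int i = 0; i < reps; i++) {
            printf("%d ", count);
        }
    }
    printf("\n");
    free(copy);
}

int main(void)
{
    /* hypothetical value for a 4-node allocation */
    print_expanded("2(x3),1");              /* prints: 2 2 2 1 */
    return 0;
}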