Ensure the LSF ras adds the hosts to the allocation. Correctly handle the semi-colon vs comma situation in hwloc slot_lists
Этот коммит содержится в:
родитель
3f9d9ae8b6
Коммит
960ef34988
@ -1217,8 +1217,8 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
opal_hwloc_resource_type_t rtype,
|
||||
hwloc_cpuset_t cpumask)
|
||||
{
|
||||
char **item;
|
||||
int rc, i, j;
|
||||
char **item, **rngs;
|
||||
int rc, i, j, k;
|
||||
hwloc_obj_t pu;
|
||||
hwloc_cpuset_t pucpus;
|
||||
char **range, **list;
|
||||
@ -1237,13 +1237,8 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
"slot assignment: slot_list == %s",
|
||||
slot_str);
|
||||
|
||||
/* if someone used commas instead of semicolons, let them */
|
||||
if (NULL != strchr(slot_str, ',')) {
|
||||
item = opal_argv_split(slot_str, ',');
|
||||
} else {
|
||||
/* split at ';' */
|
||||
item = opal_argv_split(slot_str, ';');
|
||||
}
|
||||
/* split at ';' */
|
||||
item = opal_argv_split(slot_str, ';');
|
||||
|
||||
/* start with a clean mask */
|
||||
hwloc_bitmap_zero(cpumask);
|
||||
@ -1263,75 +1258,95 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
if (NULL == strchr(item[i], ':')) {
|
||||
/* binding just to the socket level, though
|
||||
* it could specify multiple sockets
|
||||
* Skip the S and look for a ranges
|
||||
*/
|
||||
if (OPAL_SUCCESS != (rc = socket_to_cpu_set(&item[i][1], /* skip the 'S' */
|
||||
topo, rtype, cpumask))) {
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
rngs = opal_argv_split(&item[i][1], ',');
|
||||
for (j=0; NULL != rngs[j]; j++) {
|
||||
if (OPAL_SUCCESS != (rc = socket_to_cpu_set(rngs[j], topo, rtype, cpumask))) {
|
||||
opal_argv_free(rngs);
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
opal_argv_free(rngs);
|
||||
} else {
|
||||
/* binding to a socket/whatever specification */
|
||||
if ('S' == item[i][0] ||
|
||||
's' == item[i][0]) {
|
||||
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(&item[i][1], /* skip the 'S' */
|
||||
topo, rtype, cpumask))) {
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
rngs = opal_argv_split(&item[i][1], ',');
|
||||
for (j=0; NULL != rngs[j]; j++) {
|
||||
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(rngs[j], topo, rtype, cpumask))) {
|
||||
opal_argv_free(rngs);
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
opal_argv_free(rngs);
|
||||
} else {
|
||||
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(item[i],
|
||||
topo, rtype, cpumask))) {
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
rngs = opal_argv_split(item[i], ',');
|
||||
for (j=0; NULL != rngs[j]; j++) {
|
||||
if (OPAL_SUCCESS != (rc = socket_core_to_cpu_set(rngs[j], topo, rtype, cpumask))) {
|
||||
opal_argv_free(rngs);
|
||||
opal_argv_free(item);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
opal_argv_free(rngs);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* just a core specification - see if one or a range was given */
|
||||
range = opal_argv_split(item[i], '-');
|
||||
range_cnt = opal_argv_count(range);
|
||||
/* see if a range was set or not */
|
||||
switch (range_cnt) {
|
||||
case 1: /* only one core, or a list of cores, specified */
|
||||
list = opal_argv_split(range[0], ',');
|
||||
for (j=0; NULL != list[j]; j++) {
|
||||
core_id = atoi(list[j]);
|
||||
/* find the specified available cpu */
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERR_SILENT;
|
||||
rngs = opal_argv_split(item[i], ',');
|
||||
for (k=0; NULL != rngs[k]; k++) {
|
||||
/* just a core specification - see if one or a range was given */
|
||||
range = opal_argv_split(rngs[k], '-');
|
||||
range_cnt = opal_argv_count(range);
|
||||
/* see if a range was set or not */
|
||||
switch (range_cnt) {
|
||||
case 1: /* only one core, or a list of cores, specified */
|
||||
list = opal_argv_split(range[0], ',');
|
||||
for (j=0; NULL != list[j]; j++) {
|
||||
core_id = atoi(list[j]);
|
||||
/* find the specified available cpu */
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
opal_argv_free(rngs);
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
/* get the available cpus for that object */
|
||||
pucpus = opal_hwloc_base_get_available_cpus(topo, pu);
|
||||
/* set that in the mask */
|
||||
hwloc_bitmap_or(cpumask, cpumask, pucpus);
|
||||
}
|
||||
/* get the available cpus for that object */
|
||||
pucpus = opal_hwloc_base_get_available_cpus(topo, pu);
|
||||
/* set that in the mask */
|
||||
hwloc_bitmap_or(cpumask, cpumask, pucpus);
|
||||
}
|
||||
opal_argv_free(list);
|
||||
break;
|
||||
opal_argv_free(list);
|
||||
break;
|
||||
|
||||
case 2: /* range of core id's was given */
|
||||
lower_range = atoi(range[0]);
|
||||
upper_range = atoi(range[1]);
|
||||
for (core_id=lower_range; core_id <= upper_range; core_id++) {
|
||||
/* find the specified logical available cpu */
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERR_SILENT;
|
||||
case 2: /* range of core id's was given */
|
||||
lower_range = atoi(range[0]);
|
||||
upper_range = atoi(range[1]);
|
||||
for (core_id=lower_range; core_id <= upper_range; core_id++) {
|
||||
/* find the specified logical available cpu */
|
||||
if (NULL == (pu = opal_hwloc_base_get_pu(topo, core_id, rtype))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
opal_argv_free(rngs);
|
||||
return OPAL_ERR_SILENT;
|
||||
}
|
||||
/* get the available cpus for that object */
|
||||
pucpus = opal_hwloc_base_get_available_cpus(topo, pu);
|
||||
/* set that in the mask */
|
||||
hwloc_bitmap_or(cpumask, cpumask, pucpus);
|
||||
}
|
||||
/* get the available cpus for that object */
|
||||
pucpus = opal_hwloc_base_get_available_cpus(topo, pu);
|
||||
/* set that in the mask */
|
||||
hwloc_bitmap_or(cpumask, cpumask, pucpus);
|
||||
}
|
||||
break;
|
||||
break;
|
||||
|
||||
default:
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERROR;
|
||||
default:
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
opal_argv_free(rngs);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
opal_argv_free(rngs);
|
||||
}
|
||||
}
|
||||
opal_argv_free(item);
|
||||
|
@ -59,8 +59,9 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
char **nodelist;
|
||||
orte_node_t *node;
|
||||
int i, num_nodes;
|
||||
char *affinity_file;
|
||||
|
||||
char *affinity_file, *hstname;
|
||||
bool found;
|
||||
|
||||
/* check for an affinity file */
|
||||
if (NULL != (affinity_file = getenv("LSB_AFFINITY_HOSTFILE"))) {
|
||||
/* the affinity file sequentially lists rank locations, with
|
||||
@ -70,7 +71,7 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
jdata->map = OBJ_NEW(orte_job_map_t);
|
||||
}
|
||||
ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_SEQ)
|
||||
jdata->map->req_mapper = strdup("seq"); // need sequential mapper
|
||||
jdata->map->req_mapper = strdup("seq"); // need sequential mapper
|
||||
/* tell the sequential mapper that all cpusets are to be treated as "physical" */
|
||||
orte_set_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, true, NULL, OPAL_BOOL);
|
||||
/* get the apps and set the hostfile attribute in each to point to
|
||||
@ -81,6 +82,38 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
|
||||
}
|
||||
orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, true, (void*)&affinity_file, OPAL_STRING);
|
||||
}
|
||||
/* read the specified file to get the allocation */
|
||||
fp = fopen(affinity_file, "r");
|
||||
if (NULL == fp) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
while (NULL != (hstname = orte_getline(fp))) {
|
||||
if (0 == strlen(hstname)) {
|
||||
/* blank line - ignore */
|
||||
continue;
|
||||
}
|
||||
/* see if we already have this node */
|
||||
found = false;
|
||||
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
|
||||
if (0 == strcmp(node->name, hstname)) {
|
||||
/* just increment the slots */
|
||||
++node->slots;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
node = OBJ_NEW(orte_node_t);
|
||||
node->name = strdup(hstname);
|
||||
node->slots_inuse = 0;
|
||||
node->slots_max = 0;
|
||||
node->slots = 1;
|
||||
opal_list_append(nodes, &node->super);
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user