Handle the case where a user's rankfile specifies only CPUs rather than socket:cpu pairs.
This commit was SVN r25803.
Этот коммит содержится в:
родитель
61ac2bb11b
Коммит
3f31feee6f
@ -847,7 +847,7 @@ static int socket_core_to_cpu_set(char *socket_core_list,
|
||||
opal_argv_free(range);
|
||||
}
|
||||
} else {
|
||||
/* we don't support other levels yet */
|
||||
/* unrecognized option */
|
||||
rc = OPAL_ERR_NOT_SUPPORTED;
|
||||
break;
|
||||
}
|
||||
@ -863,7 +863,12 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
{
|
||||
char **item;
|
||||
int rc, i;
|
||||
|
||||
hwloc_obj_t pu;
|
||||
hwloc_cpuset_t pucpus;
|
||||
char **range;
|
||||
size_t range_cnt;
|
||||
int core_id, lower_range, upper_range;
|
||||
|
||||
/* bozo checks */
|
||||
if (NULL == opal_hwloc_topology) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
@ -904,9 +909,49 @@ int opal_hwloc_base_slot_list_parse(const char *slot_str,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* we don't support other things yet */
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
/* just a core specification - see if one or a range was given */
|
||||
range = opal_argv_split(item[i], '-');
|
||||
range_cnt = opal_argv_count(range);
|
||||
hwloc_bitmap_zero(cpumask);
|
||||
/* see if a range was set or not */
|
||||
switch (range_cnt) {
|
||||
case 1: /* only one core specified */
|
||||
core_id = atoi(range[0]);
|
||||
/* find the specified logical available cpu */
|
||||
if (NULL == (pu = get_pu(topo, core_id))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
/* get the available cpus for that object */
|
||||
pucpus = opal_hwloc_base_get_available_cpus(topo, pu);
|
||||
/* set that in the mask */
|
||||
hwloc_bitmap_copy(cpumask, pucpus);
|
||||
break;
|
||||
|
||||
case 2: /* range of core id's was given */
|
||||
lower_range = atoi(range[0]);
|
||||
upper_range = atoi(range[1]);
|
||||
hwloc_bitmap_zero(cpumask);
|
||||
for (core_id=lower_range; core_id <= upper_range; core_id++) {
|
||||
/* find the specified logical available cpu */
|
||||
if (NULL == (pu = get_pu(topo, core_id))) {
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
/* get the available cpus for that object */
|
||||
pucpus = opal_hwloc_base_get_available_cpus(topo, pu);
|
||||
/* set that in the mask */
|
||||
hwloc_bitmap_or(cpumask, cpumask, pucpus);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
opal_argv_free(range);
|
||||
opal_argv_free(item);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
opal_argv_free(item);
|
||||
|
@ -79,7 +79,11 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
int rc;
|
||||
int32_t ljob;
|
||||
int i;
|
||||
|
||||
orte_node_t *node;
|
||||
#if OPAL_HAVE_HWLOC
|
||||
hwloc_topology_t t0;
|
||||
#endif
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:setup_job for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -124,6 +128,25 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* if we are not going to launch, then we need to set any
|
||||
* undefined topologies to match our own so the mapper
|
||||
* can operate
|
||||
*/
|
||||
if (orte_do_not_launch) {
|
||||
node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
||||
t0 = node->topology;
|
||||
for (i=1; i < orte_node_pool->size; i++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
|
||||
continue;
|
||||
}
|
||||
if (NULL == node->topology) {
|
||||
node->topology = t0;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* map the job */
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdata))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
|
@ -131,15 +131,14 @@ You can override this protection by adding the "overload-allowed"
|
||||
option to your binding directive.
|
||||
#
|
||||
[rmaps:no-topology]
|
||||
A request was made for nperxxx that requires knowledge of
|
||||
A mapping directive was given that requires knowledge of
|
||||
a remote node's topology. However, no topology info is
|
||||
available for the following node:
|
||||
|
||||
Node: %s
|
||||
|
||||
The job cannot be executed under this condition. Please either
|
||||
remove the nperxxx directive and specify the number of processes
|
||||
to use, or investigate the lack of topology info.
|
||||
remove the directive or investigate the lack of topology info.
|
||||
#
|
||||
[rmaps:no-available-cpus]
|
||||
While computing bindings, we found no available cpus on
|
||||
|
@ -512,6 +512,11 @@ static int bind_in_place(orte_job_t *jdata,
|
||||
|
||||
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
||||
{
|
||||
if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
/* user specified binding by rankfile - nothing for us to do */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) ||
|
||||
OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
/* no binding requested */
|
||||
|
@ -107,6 +107,8 @@ static int orte_rmaps_rank_file_open(void)
|
||||
}
|
||||
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYUSER);
|
||||
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
|
||||
/* we are going to bind to cpuset since the user is specifying the cpus */
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
|
||||
/* make us first */
|
||||
my_priority = 10000;
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user