Merge pull request #6584 from markalle/cpu_set
binding: -cpu-set as a constraint rather than as a binding
Этот коммит содержится в:
Коммит
07e5c54a46
@ -3,6 +3,7 @@
|
|||||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
|
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2016-2017 Research Organization for Information Science
|
* Copyright (c) 2016-2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 IBM Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -217,8 +218,15 @@ static int opal_hwloc_base_open(mca_base_open_flag_t flags)
|
|||||||
* we do bind to the given cpus if provided, otherwise this would be
|
* we do bind to the given cpus if provided, otherwise this would be
|
||||||
* ignored if someone didn't also specify a binding policy
|
* ignored if someone didn't also specify a binding policy
|
||||||
*/
|
*/
|
||||||
|
// Restoring pre ef86707fbe3392c8ed15f79cc4892f0313b409af behavior.
|
||||||
|
// Formerly -cpu-set #,#,# along with -use-hwthread-cpus resulted
|
||||||
|
// in the binding policy staying OPAL_BIND_TO_HWTHREAD
|
||||||
|
// I think that should be right because I thought -cpu-set was a constraint you put
|
||||||
|
// on another binding policy, not a binding policy in itself.
|
||||||
|
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
|
||||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
|
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* if we are binding to hwthreads, then we must use hwthreads as cpus */
|
/* if we are binding to hwthreads, then we must use hwthreads as cpus */
|
||||||
if (OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) == OPAL_BIND_TO_HWTHREAD) {
|
if (OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) == OPAL_BIND_TO_HWTHREAD) {
|
||||||
|
@ -765,15 +765,34 @@ static hwloc_obj_t df_search(hwloc_topology_t topo,
|
|||||||
return found;
|
return found;
|
||||||
}
|
}
|
||||||
if (OPAL_HWLOC_AVAILABLE == rtype) {
|
if (OPAL_HWLOC_AVAILABLE == rtype) {
|
||||||
|
// The previous (3.x) code included a check for
|
||||||
|
// available = opal_hwloc_base_get_available_cpus(topo, start)
|
||||||
|
// and skipped objs that had hwloc_bitmap_iszero(available)
|
||||||
|
hwloc_obj_t root;
|
||||||
|
opal_hwloc_topo_data_t *rdata;
|
||||||
|
root = hwloc_get_root_obj(topo);
|
||||||
|
rdata = (opal_hwloc_topo_data_t*)root->userdata;
|
||||||
|
hwloc_cpuset_t constrained_cpuset;
|
||||||
|
|
||||||
|
constrained_cpuset = hwloc_bitmap_alloc();
|
||||||
|
if (rdata && rdata->available) {
|
||||||
|
hwloc_bitmap_and(constrained_cpuset, start->cpuset, rdata->available);
|
||||||
|
} else {
|
||||||
|
hwloc_bitmap_copy(constrained_cpuset, start->cpuset);
|
||||||
|
}
|
||||||
|
|
||||||
unsigned idx = 0;
|
unsigned idx = 0;
|
||||||
if (num_objs)
|
if (num_objs)
|
||||||
*num_objs = hwloc_get_nbobjs_inside_cpuset_by_depth(topo, start->cpuset, search_depth);
|
*num_objs = hwloc_get_nbobjs_inside_cpuset_by_depth(topo, constrained_cpuset, search_depth);
|
||||||
obj = NULL;
|
obj = NULL;
|
||||||
while ((obj = hwloc_get_next_obj_inside_cpuset_by_depth(topo, start->cpuset, search_depth, obj)) != NULL) {
|
while ((obj = hwloc_get_next_obj_inside_cpuset_by_depth(topo, constrained_cpuset, search_depth, obj)) != NULL) {
|
||||||
if (idx == nobj)
|
if (idx == nobj) {
|
||||||
|
hwloc_bitmap_free(constrained_cpuset);
|
||||||
return obj;
|
return obj;
|
||||||
|
}
|
||||||
idx++;
|
idx++;
|
||||||
}
|
}
|
||||||
|
hwloc_bitmap_free(constrained_cpuset);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
* Copyright (c) 2015-2017 Research Organization for Information Science
|
* Copyright (c) 2015-2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2018 Inria. All rights reserved.
|
* Copyright (c) 2018 Inria. All rights reserved.
|
||||||
|
* Copyright (c) 2019 IBM Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -168,8 +169,19 @@ static int bind_generic(orte_job_t *jdata,
|
|||||||
trg_obj = NULL;
|
trg_obj = NULL;
|
||||||
min_bound = UINT_MAX;
|
min_bound = UINT_MAX;
|
||||||
while (NULL != (tmp_obj = hwloc_get_next_obj_by_depth(node->topology->topo, target_depth, tmp_obj))) {
|
while (NULL != (tmp_obj = hwloc_get_next_obj_by_depth(node->topology->topo, target_depth, tmp_obj))) {
|
||||||
|
hwloc_obj_t root;
|
||||||
|
opal_hwloc_topo_data_t *rdata;
|
||||||
|
root = hwloc_get_root_obj(node->topology->topo);
|
||||||
|
rdata = (opal_hwloc_topo_data_t*)root->userdata;
|
||||||
|
|
||||||
if (!hwloc_bitmap_intersects(locale->cpuset, tmp_obj->cpuset))
|
if (!hwloc_bitmap_intersects(locale->cpuset, tmp_obj->cpuset))
|
||||||
continue;
|
continue;
|
||||||
|
// From the old 3.x code trg_obj was picked via a call to
|
||||||
|
// opal_hwloc_base_find_min_bound_target_under_obj() which
|
||||||
|
// skipped over unavailable objects (via opal_hwloc_base_get_npus).
|
||||||
|
if (rdata && rdata->available && !hwloc_bitmap_intersects(rdata->available, tmp_obj->cpuset))
|
||||||
|
continue;
|
||||||
|
|
||||||
data = (opal_hwloc_obj_data_t*)tmp_obj->userdata;
|
data = (opal_hwloc_obj_data_t*)tmp_obj->userdata;
|
||||||
if (NULL == data) {
|
if (NULL == data) {
|
||||||
data = OBJ_NEW(opal_hwloc_obj_data_t);
|
data = OBJ_NEW(opal_hwloc_obj_data_t);
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2017 Research Organization for Information Science
|
* Copyright (c) 2017 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
|
* Copyright (c) 2019 IBM Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -377,8 +378,25 @@ static int rank_by(orte_job_t *jdata,
|
|||||||
* Perhaps someday someone will come up with a more efficient
|
* Perhaps someday someone will come up with a more efficient
|
||||||
* algorithm, but this works for now.
|
* algorithm, but this works for now.
|
||||||
*/
|
*/
|
||||||
|
// In 3.x this was two loops:
|
||||||
|
// while (cnt < app->num_procs)
|
||||||
|
// for (i=0; i<num_objs; ...)
|
||||||
|
// Then in 4.x it switched to
|
||||||
|
// while (cnt < app->num_procs && i < (int)node->num_procs)
|
||||||
|
// where that extra i part seems wrong to me. First of all if anything
|
||||||
|
// it seems like it should be i<num_objs since that's the array i is
|
||||||
|
// cycling through, but even then all the usage of i below is
|
||||||
|
// (i % num_objs) so I think i is intended to wrap and you should
|
||||||
|
// keep looping until you've made all the assignments you can for
|
||||||
|
// this node.
|
||||||
|
//
|
||||||
|
// So that's what I added the other loop counter for, figuring if it
|
||||||
|
// cycles through the whole array of objs without making an assignment
|
||||||
|
// it's time for this loop to end and the outer loop to take us to the
|
||||||
|
// next node.
|
||||||
i = 0;
|
i = 0;
|
||||||
while (cnt < app->num_procs && i < (int)node->num_procs) {
|
int niters_of_i_without_assigning_a_proc = 0;
|
||||||
|
while (cnt < app->num_procs && niters_of_i_without_assigning_a_proc <= num_objs) {
|
||||||
/* get the next object */
|
/* get the next object */
|
||||||
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i % num_objs);
|
obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i % num_objs);
|
||||||
if (NULL == obj) {
|
if (NULL == obj) {
|
||||||
@ -446,6 +464,7 @@ static int rank_by(orte_job_t *jdata,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
num_ranked++;
|
num_ranked++;
|
||||||
|
niters_of_i_without_assigning_a_proc = 0;
|
||||||
/* track where the highest vpid landed - this is our
|
/* track where the highest vpid landed - this is our
|
||||||
* new bookmark
|
* new bookmark
|
||||||
*/
|
*/
|
||||||
@ -454,6 +473,7 @@ static int rank_by(orte_job_t *jdata,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
i++;
|
i++;
|
||||||
|
++niters_of_i_without_assigning_a_proc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
|
* Copyright (c) 2019 IBM Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -106,7 +107,8 @@ static int orte_rmaps_rank_file_register(void)
|
|||||||
static int orte_rmaps_rank_file_open(void)
|
static int orte_rmaps_rank_file_open(void)
|
||||||
{
|
{
|
||||||
/* ensure we flag mapping by user */
|
/* ensure we flag mapping by user */
|
||||||
if ((NULL != opal_hwloc_base_cpu_list && !OPAL_BIND_ORDERED_REQUESTED(opal_hwloc_binding_policy)) ||
|
if ((OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) &&
|
||||||
|
!OPAL_BIND_ORDERED_REQUESTED(opal_hwloc_binding_policy)) ||
|
||||||
NULL != orte_rankfile) {
|
NULL != orte_rankfile) {
|
||||||
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
|
if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
|
||||||
/* if a non-default mapping is already specified, then we
|
/* if a non-default mapping is already specified, then we
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user