Fix cpu-list for non-uniform nodes
* The `--cpu-list` argument restricts the hwloc topology. When that topology is sent from the remote daemon back to the HNP, it is packed as XML, and this packing loses the applied cpu-list. The HNP then maps the processes using the full topology instead of the restricted topology. When the launch command reaches the remote node, it is not congruent with the remote node's view of the topology, leading to a launch failure, especially if the HNP and remote node have differing topologies. - The solution is to make sure that the HNP re-applies the cpu-list to the incoming topology from the remote node. This way the HNP and the remote node are using the same restricted topology. Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
Этот коммит содержится в:
родитель
94384991f5
Коммит
7c67fb36b3
@ -16,7 +16,7 @@
|
||||
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2016-2020 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -902,6 +902,8 @@ void orte_plm_base_daemon_topology(int status, orte_process_name_t* sender,
|
||||
orted_failed_launch = true;
|
||||
goto CLEANUP;
|
||||
}
|
||||
/* Apply any CPU filters (not preserved by the XML) */
|
||||
opal_hwloc_base_filter_cpus(topo);
|
||||
/* record the final topology */
|
||||
t->topo = topo;
|
||||
|
||||
@ -1252,6 +1254,8 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
|
||||
opal_pointer_array_add(orte_node_topologies, t);
|
||||
daemon->node->topology = t;
|
||||
if (NULL != topo) {
|
||||
/* Apply any CPU filters (not preserved by the XML) */
|
||||
opal_hwloc_base_filter_cpus(topo);
|
||||
t->topo = topo;
|
||||
} else {
|
||||
/* nope - save the signature and request the complete topology from that node */
|
||||
|
@ -13,6 +13,7 @@
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2020 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -120,6 +121,9 @@ int orte_rmaps_rr_assign_byobj(orte_job_t *jdata,
|
||||
/* get the number of objects of this type on this node */
|
||||
nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE);
|
||||
if (0 == nobjs) {
|
||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:rr: found NO %s objects on node %s",
|
||||
hwloc_obj_type_string(target), node->name);
|
||||
continue;
|
||||
}
|
||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user