1
1

Resolve the problem of binding in inverted topologies - check the relative depth of the map and bind objects in the topology, and let that determine whether we bind downwards or upwards.

cmr=v1.7.5:reviewer=jsquyres:subject=Resolve the problem of binding in inverted topologies

This commit was SVN r30643.
Этот коммит содержится в:
Ralph Castain 2014-02-09 05:30:17 +00:00
родитель 0ee38353ba
Коммит ca0c806662
3 изменённых файла: 385 добавлений и 508 удалений

Просмотреть файл

@ -177,7 +177,9 @@ typedef uint16_t opal_binding_policy_t;
#define OPAL_BIND_IF_SUPPORTED 0x1000
#define OPAL_BIND_ALLOW_OVERLOAD 0x2000
#define OPAL_BIND_GIVEN 0x4000
/* binding policies */
/* binding policies - any changes in these
* values must be reflected in orte/mca/rmaps/rmaps.h
*/
#define OPAL_BIND_TO_NONE 1
#define OPAL_BIND_TO_BOARD 2
#define OPAL_BIND_TO_NUMA 3

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -102,6 +102,7 @@ static void reset_usage(orte_node_t *node, orte_jobid_t jobid)
}
static int bind_upwards(orte_job_t *jdata,
orte_node_t *node,
hwloc_obj_type_t target,
unsigned cache_level)
{
@ -109,14 +110,12 @@ static int bind_upwards(orte_job_t *jdata,
* until we find an object of type target - and then bind
* the process to that target
*/
int i, j;
int j;
orte_job_map_t *map;
orte_node_t *node;
orte_proc_t *proc;
hwloc_obj_t obj;
hwloc_cpuset_t cpus;
unsigned int idx, ncpus;
struct hwloc_topology_support *support;
opal_hwloc_obj_data_t *data;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
@ -126,69 +125,6 @@ static int bind_upwards(orte_job_t *jdata,
/* initialize */
map = jdata->map;
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
if (!orte_do_not_launch) {
/* if we don't want to launch, then we are just testing the system,
* so ignore questions about support capabilities
*/
support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
/* check if topology supports cpubind - have to be careful here
* as Linux doesn't currently support thread-level binding. This
* may change in the future, though, and it isn't clear how hwloc
* interprets the current behavior. So check both flags to be sure.
*/
if (!support->cpubind->set_thisproc_cpubind &&
!support->cpubind->set_thisthread_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
return ORTE_ERR_SILENT;
}
/* check if topology supports membind - have to be careful here
* as hwloc treats this differently than I (at least) would have
* expected. Per hwloc, Linux memory binding is at the thread,
* and not process, level. Thus, hwloc sets the "thisproc" flag
* to "false" on all Linux systems, and uses the "thisthread" flag
* to indicate binding capability - don't warn if the user didn't
* specifically request binding
*/
if (!support->membind->set_thisproc_membind &&
!support->membind->set_thisthread_membind &&
(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
membind_warned = true;
} else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
return ORTE_ERR_SILENT;
}
}
}
/* some systems do not report cores, and so we can get a situation where our
* default binding policy will fail for no necessary reason. So if we are
* computing a binding due to our default policy, and no cores are found
* on this node, just silently skip it - we will not bind
*/
if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"Unable to bind-to core by default on node %s as no cores detected",
node->name);
continue;
}
/* we share topologies in order
* to save space, so we need to reset the usage info to reflect
* our own current state
*/
reset_usage(node, jdata->jobid);
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
@ -272,23 +208,21 @@ static int bind_upwards(orte_job_t *jdata,
return ORTE_ERR_SILENT;
}
}
}
return ORTE_SUCCESS;
}
static int bind_downwards(orte_job_t *jdata,
orte_node_t *node,
hwloc_obj_type_t target,
unsigned cache_level)
{
int i, j;
int j;
orte_job_map_t *map;
orte_node_t *node;
orte_proc_t *proc;
hwloc_obj_t trg_obj, nxt_obj;
hwloc_cpuset_t cpus;
unsigned int ncpus;
struct hwloc_topology_support *support;
opal_hwloc_obj_data_t *data;
int total_cpus;
hwloc_cpuset_t totalcpuset;
@ -301,72 +235,6 @@ static int bind_downwards(orte_job_t *jdata,
map = jdata->map;
totalcpuset = hwloc_bitmap_alloc();
for (i=0; i < map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
if (!orte_do_not_launch) {
/* if we don't want to launch, then we are just testing the system,
* so ignore questions about support capabilities
*/
support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
/* check if topology supports cpubind - have to be careful here
* as Linux doesn't currently support thread-level binding. This
* may change in the future, though, and it isn't clear how hwloc
* interprets the current behavior. So check both flags to be sure.
*/
if (!support->cpubind->set_thisproc_cpubind &&
!support->cpubind->set_thisthread_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
/* check if topology supports membind - have to be careful here
* as hwloc treats this differently than I (at least) would have
* expected. Per hwloc, Linux memory binding is at the thread,
* and not process, level. Thus, hwloc sets the "thisproc" flag
* to "false" on all Linux systems, and uses the "thisthread" flag
* to indicate binding capability - don't warn if the user didn't
* specifically request binding
*/
if (!support->membind->set_thisproc_membind &&
!support->membind->set_thisthread_membind &&
(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
membind_warned = true;
} else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
}
}
/* some systems do not report cores, and so we can get a situation where our
* default binding policy will fail for no necessary reason. So if we are
* computing a binding due to our default policy, and no cores are found
* on this node, just silently skip it - we will not bind
*/
if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"Unable to bind-to core by default on node %s as no cores detected",
node->name);
continue;
}
/* we share topologies in order
* to save space, so we need to reset the usage info to reflect
* our own current state
*/
reset_usage(node, jdata->jobid);
/* cycle thru the procs */
for (j=0; j < node->procs->size; j++) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
@ -468,7 +336,6 @@ static int bind_downwards(orte_job_t *jdata,
}
}
}
}
hwloc_bitmap_free(totalcpuset);
return ORTE_SUCCESS;
@ -715,17 +582,31 @@ static int bind_to_cpuset(orte_job_t *jdata)
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
hwloc_obj_type_t hwb, hwm;
unsigned clvl=0;
opal_binding_policy_t bind;
orte_mapping_policy_t map;
orte_node_t *node;
int i, rc;
struct hwloc_topology_support *support;
bool force_down = false;
hwloc_cpuset_t totalcpuset;
int bind_depth, map_depth;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: compute bindings for job %s with policy %s",
ORTE_JOBID_PRINT(jdata->jobid),
opal_hwloc_base_print_binding(jdata->map->binding));
if (ORTE_MAPPING_BYUSER == ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping)) {
map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);
if (ORTE_MAPPING_BYUSER == map) {
/* user specified binding by rankfile - nothing for us to do */
return ORTE_SUCCESS;
}
if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (OPAL_BIND_TO_CPUSET == bind) {
int rc;
/* cpuset was given - setup the bindings */
if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
@ -734,17 +615,76 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
return rc;
}
if (OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (OPAL_BIND_TO_NONE == bind) {
/* no binding requested */
return ORTE_SUCCESS;
}
if (OPAL_BIND_TO_BOARD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (OPAL_BIND_TO_BOARD == bind) {
/* doesn't do anything at this time */
return ORTE_SUCCESS;
}
/* binding requested */
/* binding requested - convert the binding level to the hwloc obj type */
switch (bind) {
case OPAL_BIND_TO_NUMA:
hwb = HWLOC_OBJ_NODE;
break;
case OPAL_BIND_TO_SOCKET:
hwb = HWLOC_OBJ_SOCKET;
break;
case OPAL_BIND_TO_L3CACHE:
hwb = HWLOC_OBJ_CACHE;
clvl = 3;
break;
case OPAL_BIND_TO_L2CACHE:
hwb = HWLOC_OBJ_CACHE;
clvl = 2;
break;
case OPAL_BIND_TO_L1CACHE:
hwb = HWLOC_OBJ_CACHE;
clvl = 1;
break;
case OPAL_BIND_TO_CORE:
hwb = HWLOC_OBJ_CORE;
break;
case OPAL_BIND_TO_HWTHREAD:
hwb = HWLOC_OBJ_PU;
break;
default:
return ORTE_ERR_BAD_PARAM;
}
/* do the same for the mapping policy */
switch (map) {
case ORTE_MAPPING_BYNUMA:
hwm = HWLOC_OBJ_NODE;
break;
case ORTE_MAPPING_BYSOCKET:
hwm = HWLOC_OBJ_SOCKET;
break;
case ORTE_MAPPING_BYL3CACHE:
hwm = HWLOC_OBJ_CACHE;
clvl = 3;
break;
case ORTE_MAPPING_BYL2CACHE:
hwm = HWLOC_OBJ_CACHE;
clvl = 2;
break;
case ORTE_MAPPING_BYL1CACHE:
hwm = HWLOC_OBJ_CACHE;
clvl = 1;
break;
case ORTE_MAPPING_BYCORE:
hwm = HWLOC_OBJ_CORE;
break;
case ORTE_MAPPING_BYHWTHREAD:
hwm = HWLOC_OBJ_PU;
break;
default:
return ORTE_ERR_BAD_PARAM;
}
/* if the job was mapped by the corresponding target, then
* we bind in place
*
@ -758,41 +698,19 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
* procs to the resources below.
*/
if (ORTE_MAPPING_BYDIST == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_MAPPING_BYDIST == map) {
int rc = ORTE_SUCCESS;
if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (OPAL_BIND_TO_NUMA == bind) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - dist to numa",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
} else if (OPAL_BIND_TO_NUMA < OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
ORTE_ERROR_LOG(rc);
}
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
} else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
} else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
} else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
} else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
}
} else if (OPAL_BIND_TO_NUMA < bind) {
/* bind every proc downwards */
force_down = true;
goto execute;
}
/* if the binding policy is less than numa, then we are unbound - so
* just ignore this and return (should have been caught in prior
@ -800,176 +718,127 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
* and "board")
*/
return rc;
} else if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
}
/* now deal with the remaining binding policies based on hardware */
if (bind == map) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - hwthread to hwthread",
"mca:rmaps: bindings for job %s - bind in place",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_PU, 0))) {
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* HW threads are at the bottom, so we must have mapped somewhere
* above this level - and we need to bind downwards
*/
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - core to core",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy used is less than bycore, then it is a
* downward binding - i.e., the locale is above the binding location.
* for example, if we map-to-socket and bind-to-core, then we compare
* the mapping value of ORTE_MAPPING_BYCORE to ORTE_MAPPING_BYSOCKET.
* In this case, BYCORE > BYSOCKET, so we know that the locale is
* above the desired binding level (sockets are at a higher level than
* the desired core binding level), and we will have to bind downwards
*/
if (ORTE_MAPPING_BYCORE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CORE, 0))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - L1cache to L1cache",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than l1cache, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYL1CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 1))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - L2cache to L2cache",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than l2cache, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYL2CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 2))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - L3cache to L3cache",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than l3cache, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYL3CACHE > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_CACHE, 3))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - socket to socket",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than bysocket, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYSOCKET > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
} else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
int rc;
if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bindings for job %s - numa to numa",
ORTE_JOBID_PRINT(jdata->jobid));
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* if the mapping policy is less than numa, then it is a
* downward binding
*/
if (ORTE_MAPPING_BYNUMA > ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, HWLOC_OBJ_NODE, 0))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}
/* if we get here, then the binding policy isn't supported */
ORTE_ERROR_LOG(ORTE_ERR_NOT_SUPPORTED);
return ORTE_ERR_NOT_SUPPORTED;
/* we need to handle the remaining binding options on a per-node
* basis because different nodes could potentially have different
* topologies, with different relative depths for the two levels
*/
execute:
/* initialize */
totalcpuset = hwloc_bitmap_alloc();
for (i=0; i < jdata->map->nodes->size; i++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
continue;
}
if (!orte_do_not_launch) {
/* if we don't want to launch, then we are just testing the system,
* so ignore questions about support capabilities
*/
support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
/* check if topology supports cpubind - have to be careful here
* as Linux doesn't currently support thread-level binding. This
* may change in the future, though, and it isn't clear how hwloc
* interprets the current behavior. So check both flags to be sure.
*/
if (!support->cpubind->set_thisproc_cpubind &&
!support->cpubind->set_thisthread_cpubind) {
if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) ||
!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
/* we are not required to bind, so ignore this */
continue;
}
orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
/* check if topology supports membind - have to be careful here
* as hwloc treats this differently than I (at least) would have
* expected. Per hwloc, Linux memory binding is at the thread,
* and not process, level. Thus, hwloc sets the "thisproc" flag
* to "false" on all Linux systems, and uses the "thisthread" flag
* to indicate binding capability - don't warn if the user didn't
* specifically request binding
*/
if (!support->membind->set_thisproc_membind &&
!support->membind->set_thisthread_membind &&
(OPAL_BIND_GIVEN & opal_hwloc_binding_policy)) {
if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
membind_warned = true;
} else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
}
}
/* some systems do not report cores, and so we can get a situation where our
* default binding policy will fail for no necessary reason. So if we are
* computing a binding due to our default policy, and no cores are found
* on this node, just silently skip it - we will not bind
*/
if (!(OPAL_BIND_GIVEN & opal_hwloc_binding_policy) &&
HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"Unable to bind-to core by default on node %s as no cores detected",
node->name);
continue;
}
/* we share topologies in order
* to save space, so we need to reset the usage info to reflect
* our own current state
*/
reset_usage(node, jdata->jobid);
if (force_down) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
/* determine the relative depth on this node */
bind_depth = hwloc_get_type_depth(node->topology, hwb);
if (0 > bind_depth) {
/* didn't find such an object */
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
true, hwloc_obj_type_string(hwb), node->name);
return ORTE_ERR_SILENT;
}
map_depth = hwloc_get_type_depth(node->topology, hwm);
if (0 > map_depth) {
/* didn't find such an object */
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
true, hwloc_obj_type_string(hwm), node->name);
return ORTE_ERR_SILENT;
}
if (bind_depth > map_depth) {
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
ORTE_ERROR_LOG(rc);
return rc;
}
} else {
if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
}
return ORTE_SUCCESS;
}

Просмотреть файл

@ -100,19 +100,25 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_map_t);
((pol) & 0xff00)
/* round-robin policies */
#define ORTE_MAPPING_BYSLOT 1
#define ORTE_MAPPING_BYNODE 2
#define ORTE_MAPPING_BYBOARD 3
#define ORTE_MAPPING_BYNUMA 4
#define ORTE_MAPPING_BYSOCKET 5
#define ORTE_MAPPING_BYL3CACHE 6
#define ORTE_MAPPING_BYL2CACHE 7
#define ORTE_MAPPING_BYL1CACHE 8
#define ORTE_MAPPING_BYCORE 9
#define ORTE_MAPPING_BYHWTHREAD 10
/* start with hardware-based options
* so the values match the corresponding
* levels in opal/mca/hwloc/hwloc.h
*/
#define ORTE_MAPPING_BYNODE 1
#define ORTE_MAPPING_BYBOARD 2
#define ORTE_MAPPING_BYNUMA 3
#define ORTE_MAPPING_BYSOCKET 4
#define ORTE_MAPPING_BYL3CACHE 5
#define ORTE_MAPPING_BYL2CACHE 6
#define ORTE_MAPPING_BYL1CACHE 7
#define ORTE_MAPPING_BYCORE 8
#define ORTE_MAPPING_BYHWTHREAD 9
/* now take the other round-robin options */
#define ORTE_MAPPING_BYSLOT 10
#define ORTE_MAPPING_BYDIST 11
/* convenience - declare anything <= 15 to be round-robin*/
#define ORTE_MAPPING_RR 0x000f
/* sequential policy */
#define ORTE_MAPPING_SEQ 20
/* staged execution mapping */