Clean up the detection of process binding during mpi_init. There are several cases that need to be checked:
1. no binding support - indicated by a negative return code from get_cpubind
2. binding supported, but not bound - the bitset returned by get_cpubind is the same as the available cpuset
3. binding supported and bound - the bitset from get_cpubind is a subset of the available cpuset
4. only one cpu is available - in this case, get_cpubind matches the available cpuset, but we are effectively bound

This commit was SVN r25957.
Этот коммит содержится в:
родитель
10f94efbda
Коммит
534d70025f
@ -578,18 +578,26 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
|
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
|
||||||
/* get our node object */
|
/* get our node object */
|
||||||
node = hwloc_get_root_obj(opal_hwloc_topology);
|
node = hwloc_get_root_obj(opal_hwloc_topology);
|
||||||
nodeset = hwloc_bitmap_alloc();
|
nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node);
|
||||||
hwloc_bitmap_and(nodeset, node->online_cpuset, node->allowed_cpuset);
|
/* get our bindings */
|
||||||
/* get our cpuset */
|
|
||||||
cpus = hwloc_bitmap_alloc();
|
cpus = hwloc_bitmap_alloc();
|
||||||
hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS);
|
if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
|
||||||
/* we are bound if the two cpusets are not equal */
|
/* we are NOT bound if get_cpubind fails, nor can we be bound - the
|
||||||
if (0 != hwloc_bitmap_compare(cpus, nodeset)) {
|
* environment does not support it
|
||||||
|
*/
|
||||||
|
hwloc_bitmap_free(cpus);
|
||||||
|
goto MOVEON;
|
||||||
|
}
|
||||||
|
/* we are bound if the two cpusets are not equal,
|
||||||
|
* or if there is only ONE cpu available to us
|
||||||
|
*/
|
||||||
|
if (0 != hwloc_bitmap_compare(cpus, nodeset) ||
|
||||||
|
opal_hwloc_base_single_cpu(nodeset) ||
|
||||||
|
opal_hwloc_base_single_cpu(cpus)) {
|
||||||
/* someone external set it - indicate it is set
|
/* someone external set it - indicate it is set
|
||||||
* so that we know
|
* so that we know
|
||||||
*/
|
*/
|
||||||
paffinity_enabled = true;
|
paffinity_enabled = true;
|
||||||
hwloc_bitmap_free(nodeset);
|
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
} else if (support->cpubind->set_thisproc_cpubind &&
|
} else if (support->cpubind->set_thisproc_cpubind &&
|
||||||
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
|
OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
|
||||||
@ -602,25 +610,21 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
if (ORTE_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
|
if (ORTE_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
|
||||||
opal_hwloc_topology, cpus))) {
|
opal_hwloc_topology, cpus))) {
|
||||||
error = "Setting processor affinity failed";
|
error = "Setting processor affinity failed";
|
||||||
hwloc_bitmap_free(nodeset);
|
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
||||||
error = "Setting processor affinity failed";
|
error = "Setting processor affinity failed";
|
||||||
hwloc_bitmap_free(nodeset);
|
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
/* try to find a level and index for this location */
|
/* try to find a level and index for this location */
|
||||||
opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx);
|
opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx);
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
hwloc_bitmap_free(nodeset);
|
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
paffinity_enabled = true;
|
paffinity_enabled = true;
|
||||||
} else {
|
} else {
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
hwloc_bitmap_free(nodeset);
|
|
||||||
hwloc_bitmap_free(cpus);
|
hwloc_bitmap_free(cpus);
|
||||||
/* get the node rank */
|
/* get the node rank */
|
||||||
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
|
if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
|
||||||
@ -640,15 +644,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
error = "Getting hwthread object";
|
error = "Getting hwthread object";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
cpus = hwloc_bitmap_alloc();
|
cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
|
||||||
hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
|
|
||||||
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
||||||
ret = OMPI_ERROR;
|
ret = OMPI_ERROR;
|
||||||
error = "Setting processor affinity failed";
|
error = "Setting processor affinity failed";
|
||||||
hwloc_bitmap_free(cpus);
|
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
hwloc_bitmap_free(cpus);
|
|
||||||
orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
|
orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
|
||||||
orte_process_info.bind_idx = nrank;
|
orte_process_info.bind_idx = nrank;
|
||||||
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
|
||||||
@ -661,15 +662,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
error = "Getting core object";
|
error = "Getting core object";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
cpus = hwloc_bitmap_alloc();
|
cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
|
||||||
hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
|
|
||||||
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
||||||
error = "Setting processor affinity failed";
|
error = "Setting processor affinity failed";
|
||||||
hwloc_bitmap_free(cpus);
|
|
||||||
ret = OMPI_ERROR;
|
ret = OMPI_ERROR;
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
hwloc_bitmap_free(cpus);
|
|
||||||
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
|
orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
|
||||||
orte_process_info.bind_idx = nrank;
|
orte_process_info.bind_idx = nrank;
|
||||||
} else {
|
} else {
|
||||||
@ -711,15 +709,12 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* this is the place! */
|
/* this is the place! */
|
||||||
cpus = hwloc_bitmap_alloc();
|
cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
|
||||||
hwloc_bitmap_and(cpus, obj->online_cpuset, obj->allowed_cpuset);
|
|
||||||
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
|
||||||
ret = OMPI_ERROR;
|
ret = OMPI_ERROR;
|
||||||
error = "Setting processor affinity failed";
|
error = "Setting processor affinity failed";
|
||||||
hwloc_bitmap_free(cpus);
|
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
hwloc_bitmap_free(cpus);
|
|
||||||
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
|
orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
|
||||||
obj, OPAL_HWLOC_LOGICAL);
|
obj, OPAL_HWLOC_LOGICAL);
|
||||||
paffinity_enabled = true;
|
paffinity_enabled = true;
|
||||||
@ -760,6 +755,7 @@ MOVEON:
|
|||||||
/* get the root object for this node */
|
/* get the root object for this node */
|
||||||
root = hwloc_get_root_obj(opal_hwloc_topology);
|
root = hwloc_get_root_obj(opal_hwloc_topology);
|
||||||
cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
|
cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
|
||||||
|
/* we are not bound if this equals our cpuset */
|
||||||
if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
|
if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
|
||||||
opal_output(0, "%s is not bound",
|
opal_output(0, "%s is not bound",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
|
@ -154,6 +154,7 @@ OPAL_DECLSPEC unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
|
|||||||
OPAL_DECLSPEC char* opal_hwloc_base_print_binding(opal_binding_policy_t binding);
|
OPAL_DECLSPEC char* opal_hwloc_base_print_binding(opal_binding_policy_t binding);
|
||||||
OPAL_DECLSPEC char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality);
|
OPAL_DECLSPEC char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality);
|
||||||
OPAL_DECLSPEC char* opal_hwloc_base_print_level(opal_hwloc_level_t level);
|
OPAL_DECLSPEC char* opal_hwloc_base_print_level(opal_hwloc_level_t level);
|
||||||
|
OPAL_DECLSPEC bool opal_hwloc_base_single_cpu(hwloc_cpuset_t cpuset);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provide a utility to parse a slot list against the local
|
* Provide a utility to parse a slot list against the local
|
||||||
|
@ -226,20 +226,18 @@ void opal_hwloc_base_free_topology(hwloc_topology_t topo)
|
|||||||
void opal_hwloc_base_get_local_cpuset(void)
|
void opal_hwloc_base_get_local_cpuset(void)
|
||||||
{
|
{
|
||||||
hwloc_obj_t root;
|
hwloc_obj_t root;
|
||||||
|
hwloc_cpuset_t base_cpus;
|
||||||
|
|
||||||
if (NULL != opal_hwloc_topology) {
|
if (NULL != opal_hwloc_topology) {
|
||||||
if (NULL == opal_hwloc_my_cpuset) {
|
if (NULL == opal_hwloc_my_cpuset) {
|
||||||
opal_hwloc_my_cpuset = hwloc_bitmap_alloc();
|
opal_hwloc_my_cpuset = hwloc_bitmap_alloc();
|
||||||
}
|
}
|
||||||
/* get the cpus we are bound to */
|
/* get the cpus we are bound to */
|
||||||
hwloc_get_cpubind(opal_hwloc_topology, opal_hwloc_my_cpuset, HWLOC_CPUBIND_PROCESS);
|
if (0 > hwloc_get_cpubind(opal_hwloc_topology, opal_hwloc_my_cpuset, HWLOC_CPUBIND_PROCESS)) {
|
||||||
/* if the cpuset is empty, then we are not bound */
|
/* we are not bound - use the root's available cpuset */
|
||||||
if (hwloc_bitmap_iszero(opal_hwloc_my_cpuset)) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_output,
|
|
||||||
"hwloc:base:get_local_cpuset MY LOCAL CPUSET WAS ZERO - NOT BOUND"));
|
|
||||||
/* just insert the cpuset for the root object as we are unbound */
|
|
||||||
root = hwloc_get_root_obj(opal_hwloc_topology);
|
root = hwloc_get_root_obj(opal_hwloc_topology);
|
||||||
hwloc_bitmap_copy(opal_hwloc_my_cpuset, root->cpuset);
|
base_cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
|
||||||
|
hwloc_bitmap_copy(opal_hwloc_my_cpuset, base_cpus);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -345,6 +343,34 @@ static void df_search_cores(hwloc_obj_t obj, unsigned int *cnt)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* determine if there is a single cpu in a bitmap */
|
||||||
|
bool opal_hwloc_base_single_cpu(hwloc_cpuset_t cpuset)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
bool one=false;
|
||||||
|
|
||||||
|
/* count the number of bits that are set - there is
|
||||||
|
* one bit for each available pu. We could just
|
||||||
|
* subtract the first and last indices, but there
|
||||||
|
* may be "holes" in the bitmap corresponding to
|
||||||
|
* offline or unallowed cpus - so we have to
|
||||||
|
* search for them. Return false if we anything
|
||||||
|
* other than one
|
||||||
|
*/
|
||||||
|
for (i=hwloc_bitmap_first(cpuset);
|
||||||
|
i <= hwloc_bitmap_last(cpuset);
|
||||||
|
i++) {
|
||||||
|
if (hwloc_bitmap_isset(cpuset, i)) {
|
||||||
|
if (one) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
one = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return one;
|
||||||
|
}
|
||||||
|
|
||||||
/* get the number of pu's under a given hwloc object */
|
/* get the number of pu's under a given hwloc object */
|
||||||
unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
|
unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
|
||||||
hwloc_obj_t obj)
|
hwloc_obj_t obj)
|
||||||
|
@ -6,13 +6,13 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include "opal/mca/hwloc/hwloc.h"
|
||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
|
|
||||||
#include "opal/mca/hwloc/hwloc.h"
|
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
int rank, size;
|
int rank, size, rc;
|
||||||
hwloc_cpuset_t cpus;
|
hwloc_cpuset_t cpus;
|
||||||
char *bindings;
|
char *bindings;
|
||||||
|
|
||||||
@ -21,10 +21,11 @@ int main(int argc, char* argv[])
|
|||||||
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
MPI_Comm_size(MPI_COMM_WORLD, &size);
|
||||||
|
|
||||||
cpus = hwloc_bitmap_alloc();
|
cpus = hwloc_bitmap_alloc();
|
||||||
hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS);
|
rc = hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS);
|
||||||
hwloc_bitmap_list_asprintf(&bindings, cpus);
|
hwloc_bitmap_list_asprintf(&bindings, cpus);
|
||||||
|
|
||||||
printf("Hello, World, I am %d of %d: bitmap %s\n", rank, size, bindings);
|
printf("Hello, World, I am %d of %d: rc %d bitmap %s\n", rank, size, rc,
|
||||||
|
(NULL == bindings) ? "NULL" : bindings);
|
||||||
|
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
return 0;
|
return 0;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user