Merge pull request #867 from rolfv/pr/openib-hwloc-verbosity
Add some verbosity to help debug hwloc issues
Этот коммит содержится в:
Коммит
188c30a01a
@ -2358,6 +2358,13 @@ static float get_ib_dev_distance(struct ibv_device *dev)
|
||||
goto out;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"hwloc_distances->nbobjs=%d", hwloc_distances->nbobjs);
|
||||
for (i = 0; i < (int)(2 * hwloc_distances->nbobjs); i++) {
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"hwloc_distances->latency[%d]=%f", i, hwloc_distances->latency[i]);
|
||||
}
|
||||
|
||||
/* If ibv_obj is a NUMA node or below, we're good. */
|
||||
switch (ibv_obj->type) {
|
||||
case HWLOC_OBJ_NODE:
|
||||
@ -2373,6 +2380,7 @@ static float get_ib_dev_distance(struct ibv_device *dev)
|
||||
default:
|
||||
/* If it's above a NUMA node, then I don't know how to compute
|
||||
the distance... */
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output, "ibv_obj->type set to NULL");
|
||||
ibv_obj = NULL;
|
||||
break;
|
||||
}
|
||||
@ -2382,6 +2390,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
|
||||
goto out;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"ibv_obj->logical_index=%d", ibv_obj->logical_index);
|
||||
/* This function is only called if the process is bound, so let's
|
||||
find out where we are bound to. For the moment, we only care
|
||||
about the NUMA node to which we are bound. */
|
||||
@ -2408,6 +2418,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
|
||||
my_obj = my_obj->parent;
|
||||
}
|
||||
if (NULL != my_obj) {
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"my_obj->logical_index=%d", my_obj->logical_index);
|
||||
/* Distance may be asymmetrical, so calculate both of them
|
||||
and take the max */
|
||||
a = hwloc_distances->latency[my_obj->logical_index +
|
||||
@ -2466,6 +2478,8 @@ sort_devs_by_distance(struct ibv_device **ib_devs, int count)
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
devs[i].ib_dev = ib_devs[i];
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"Checking distance from this process to device=%s", ibv_get_device_name(ib_devs[i]));
|
||||
/* If we're not bound, just assume that the device is close. */
|
||||
devs[i].distance = 0;
|
||||
if (opal_process_info.cpuset) {
|
||||
@ -2473,6 +2487,9 @@ sort_devs_by_distance(struct ibv_device **ib_devs, int count)
|
||||
an accurate distance. */
|
||||
devs[i].distance = get_ib_dev_distance(ib_devs[i]);
|
||||
}
|
||||
opal_output_verbose(5, opal_btl_base_framework.framework_output,
|
||||
"Process is %s: distance to device is %f",
|
||||
(opal_process_info.cpuset ? "bound" : "not bound"), devs[i].distance);
|
||||
}
|
||||
|
||||
qsort(devs, count, sizeof(struct dev_distance), compare_distance);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user