1
1

Merge pull request #4106 from rhc54/topic/hwloc

Add diagnostics for hwloc get_topology
Этот коммит содержится в:
Ralph Castain 2017-08-16 15:47:47 -07:00 коммит произвёл GitHub
родитель cd8db5313e 41df973359
Коммит 1f799afa30
2 изменённых файлов: 25 добавлений и 9 удалений

Просмотреть файл

@@ -277,8 +277,8 @@ int opal_hwloc_base_get_topology(void)
char *shmemfile; char *shmemfile;
#endif #endif
OPAL_OUTPUT_VERBOSE((2, opal_hwloc_base_framework.framework_output, opal_output_verbose(2, opal_hwloc_base_framework.framework_output,
"hwloc:base:get_topology")); "hwloc:base:get_topology");
/* see if we already have it */ /* see if we already have it */
if (NULL != opal_hwloc_topology) { if (NULL != opal_hwloc_topology) {
@@ -289,8 +289,8 @@ int opal_hwloc_base_get_topology(void)
if (NULL != opal_pmix.get) { if (NULL != opal_pmix.get) {
#if HWLOC_API_VERSION >= 0x20000 #if HWLOC_API_VERSION >= 0x20000
OPAL_OUTPUT_VERBOSE((2, opal_hwloc_base_framework.framework_output, opal_output_verbose(2, opal_hwloc_base_framework.framework_output,
"hwloc:base: looking for topology in shared memory")); "hwloc:base: looking for topology in shared memory");
/* first try to get the shmem link, if available */ /* first try to get the shmem link, if available */
aptr = &addr; aptr = &addr;
@@ -304,15 +304,17 @@ int opal_hwloc_base_get_topology(void)
if (OPAL_SUCCESS == rc && OPAL_SUCCESS == rc2 && OPAL_SUCCESS == rc3) { if (OPAL_SUCCESS == rc && OPAL_SUCCESS == rc2 && OPAL_SUCCESS == rc3) {
if (0 > (fd = open(shmemfile, O_RDONLY))) { if (0 > (fd = open(shmemfile, O_RDONLY))) {
free(shmemfile); free(shmemfile);
return OPAL_ERROR; OPAL_ERROR_LOG(OPAL_ERR_FILE_OPEN_FAILURE)
return OPAL_ERR_FILE_OPEN_FAILURE;
} }
free(shmemfile); free(shmemfile);
if (0 != hwloc_shmem_topology_adopt(&opal_hwloc_topology, fd, if (0 != hwloc_shmem_topology_adopt(&opal_hwloc_topology, fd,
0, (void*)addr, size, 0)) { 0, (void*)addr, size, 0)) {
return OPAL_ERROR; OPAL_ERROR_LOG(OPAL_ERR_FILE_READ_FAILURE);
return OPAL_ERR_FILE_READ_FAILURE;
} }
OPAL_OUTPUT_VERBOSE((2, opal_hwloc_base_framework.framework_output, opal_output_verbose(2, opal_hwloc_base_framework.framework_output,
"hwloc:base: topology in shared memory")); "hwloc:base: topology in shared memory");
topo_in_shmem = true; topo_in_shmem = true;
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
@@ -320,14 +322,18 @@ int opal_hwloc_base_get_topology(void)
/* if that isn't available, then try to retrieve /* if that isn't available, then try to retrieve
* the xml representation from the PMIx data store */ * the xml representation from the PMIx data store */
opal_output_verbose(1, opal_hwloc_base_framework.framework_output, opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base instantiating topology"); "hwloc:base getting topology XML string");
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_TOPO, OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_TOPO,
&wildcard_rank, &val, OPAL_STRING); &wildcard_rank, &val, OPAL_STRING);
} else { } else {
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base PMIx not available");
rc = OPAL_ERR_NOT_SUPPORTED; rc = OPAL_ERR_NOT_SUPPORTED;
} }
if (OPAL_SUCCESS == rc && NULL != val) { if (OPAL_SUCCESS == rc && NULL != val) {
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base loading topology from XML");
/* load the topology */ /* load the topology */
if (0 != hwloc_topology_init(&opal_hwloc_topology)) { if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
free(val); free(val);
@@ -361,9 +367,12 @@ int opal_hwloc_base_get_topology(void)
return rc; return rc;
} }
} else if (NULL == opal_hwloc_base_topo_file) { } else if (NULL == opal_hwloc_base_topo_file) {
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base discovering topology");
if (0 != hwloc_topology_init(&opal_hwloc_topology) || if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
0 != opal_hwloc_base_topology_set_flags(opal_hwloc_topology, 0, true) || 0 != opal_hwloc_base_topology_set_flags(opal_hwloc_topology, 0, true) ||
0 != hwloc_topology_load(opal_hwloc_topology)) { 0 != hwloc_topology_load(opal_hwloc_topology)) {
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
return OPAL_ERR_NOT_SUPPORTED; return OPAL_ERR_NOT_SUPPORTED;
} }
/* filter the cpus thru any default cpu set */ /* filter the cpus thru any default cpu set */
@@ -372,6 +381,9 @@ int opal_hwloc_base_get_topology(void)
return rc; return rc;
} }
} else { } else {
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base loading topology from file %s",
opal_hwloc_base_topo_file);
if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file))) { if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file))) {
return rc; return rc;
} }

Просмотреть файл

@@ -16,11 +16,15 @@ int main(int argc, char* argv[])
int rank, size, rc; int rank, size, rc;
hwloc_cpuset_t cpus; hwloc_cpuset_t cpus;
char *bindings = NULL; char *bindings = NULL;
pid_t pid;
MPI_Init(&argc, &argv); MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_size(MPI_COMM_WORLD, &size);
pid = getpid();
printf("[%lu] Rank %d: getting topology\n", (unsigned long)pid, rank);
fflush(stdout);
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) { if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
cpus = hwloc_bitmap_alloc(); cpus = hwloc_bitmap_alloc();
rc = hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS); rc = hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS);