1
1

Fix confusion between cpuset and locality

Ensure we correctly collect and save the cpuset of the process
separately from its locality string. Ensure we use the locality string,
rather than the cpuset, when computing things such as the relative
locality between processes.

Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
Ralph Castain 2020-11-10 16:41:00 -08:00
родитель 57ccb830c2
Коммит 2f7f1feca5
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B63B630167D26BB5
5 изменённых файлов: 18 добавлений и 23 удалений

Просмотреть файл

@ -355,7 +355,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
if (0 < opal_list_get_size(&ilist)) {
uint32_t *peer_ranks = NULL;
int prn, nprn = 0;
char *val, *mycpuset;
char *val;
uint16_t u16;
opal_process_name_t wildcard_rank;
/* convert the list of new procs to a proc_t array */
@ -380,16 +380,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
opal_argv_free(peers);
}
/* get my locality string */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
OMPI_PROC_MY_NAME, &val, PMIX_STRING);
if (OPAL_SUCCESS == rc && NULL != val) {
mycpuset = val;
} else {
mycpuset = NULL;
}
i = 0;
OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
proc = cd->p;
@ -406,8 +396,8 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
val = NULL;
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
&proc->super.proc_name, &val, OPAL_STRING);
if (OPAL_SUCCESS == rc && NULL != val) {
u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
free(val);
} else {
/* all we can say is that it shares our node */
@ -425,9 +415,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
}
++i;
}
if (NULL != mycpuset) {
free(mycpuset);
}
if (NULL != peer_ranks) {
free(peer_ranks);
}

Просмотреть файл

@ -764,7 +764,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
/* identify our location */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_CPUSET,
&opal_process_info.my_name, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) {
opal_process_info.cpuset = val;
@ -774,6 +774,15 @@ int ompi_rte_init(int *pargc, char ***pargv)
opal_process_info.cpuset = NULL;
opal_process_info.proc_is_bound = false;
}
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
&opal_process_info.my_name, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) {
opal_process_info.locality = val;
val = NULL; // protect the string
} else {
opal_process_info.locality = NULL;
}
/* retrieve the local peers - defaults to local node */
val = NULL;
@ -811,7 +820,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
&pname, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) {
u16 = opal_hwloc_compute_relative_locality(opal_process_info.cpuset, val);
u16 = opal_hwloc_compute_relative_locality(opal_process_info.locality, val);
free(val);
} else {
/* all we can say is that it shares our node */
@ -826,9 +835,6 @@ int ompi_rte_init(int *pargc, char ***pargv)
ret = opal_pmix_convert_status(rc);
error = "local store of locality";
opal_argv_free(peers);
if (NULL != opal_process_info.cpuset) {
free(opal_process_info.cpuset);
}
goto error;
}
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Triad National Security, LLC. All rights
@ -345,7 +345,7 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
}
// compute relative locality
relative_locality = opal_hwloc_compute_relative_locality(process_info->cpuset, locality_string);
relative_locality = opal_hwloc_compute_relative_locality(process_info->locality, locality_string);
free(locality_string);
if (relative_locality & OPAL_PROC_ON_SOCKET) {

Просмотреть файл

@ -41,6 +41,7 @@ opal_process_info_t opal_process_info = {
.my_local_rank = 0, /* I'm the only process around here */
.my_node_rank = 0,
.cpuset = NULL,
.locality = NULL,
.pid = 0,
.num_procs = 0,
.app_num = 0,

Просмотреть файл

@ -115,6 +115,7 @@ typedef struct opal_process_info_t {
uint16_t my_local_rank; /**< local rank on this node within my job */
uint16_t my_node_rank;
char *cpuset; /**< String-representation of bitmap where we are bound */
char *locality; /**< String-representation of process locality */
pid_t pid;
uint32_t num_procs;
uint32_t app_num;