Fix confusion between cpuset and locality
Ensure we correctly collect and save the cpuset of the process separately from its locality string. Ensure we use the correct one when computing things like relative locality between processes. Signed-off-by: Ralph Castain <rhc@pmix.org>
Этот коммит содержится в:
родитель
57ccb830c2
Коммит
2f7f1feca5
@ -355,7 +355,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
|
||||
if (0 < opal_list_get_size(&ilist)) {
|
||||
uint32_t *peer_ranks = NULL;
|
||||
int prn, nprn = 0;
|
||||
char *val, *mycpuset;
|
||||
char *val;
|
||||
uint16_t u16;
|
||||
opal_process_name_t wildcard_rank;
|
||||
/* convert the list of new procs to a proc_t array */
|
||||
@ -380,16 +380,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
|
||||
opal_argv_free(peers);
|
||||
}
|
||||
|
||||
/* get my locality string */
|
||||
val = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
|
||||
OMPI_PROC_MY_NAME, &val, PMIX_STRING);
|
||||
if (OPAL_SUCCESS == rc && NULL != val) {
|
||||
mycpuset = val;
|
||||
} else {
|
||||
mycpuset = NULL;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
|
||||
proc = cd->p;
|
||||
@ -406,8 +396,8 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
|
||||
val = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
|
||||
&proc->super.proc_name, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == rc && NULL != val) {
|
||||
u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
|
||||
if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
|
||||
u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
|
||||
free(val);
|
||||
} else {
|
||||
/* all we can say is that it shares our node */
|
||||
@ -425,9 +415,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if (NULL != mycpuset) {
|
||||
free(mycpuset);
|
||||
}
|
||||
if (NULL != peer_ranks) {
|
||||
free(peer_ranks);
|
||||
}
|
||||
|
@ -764,7 +764,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
|
||||
/* identify our location */
|
||||
val = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_CPUSET,
|
||||
&opal_process_info.my_name, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS == rc && NULL != val) {
|
||||
opal_process_info.cpuset = val;
|
||||
@ -774,6 +774,15 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
opal_process_info.cpuset = NULL;
|
||||
opal_process_info.proc_is_bound = false;
|
||||
}
|
||||
val = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
|
||||
&opal_process_info.my_name, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS == rc && NULL != val) {
|
||||
opal_process_info.locality = val;
|
||||
val = NULL; // protect the string
|
||||
} else {
|
||||
opal_process_info.locality = NULL;
|
||||
}
|
||||
|
||||
/* retrieve the local peers - defaults to local node */
|
||||
val = NULL;
|
||||
@ -811,7 +820,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
|
||||
&pname, &val, PMIX_STRING);
|
||||
if (PMIX_SUCCESS == rc && NULL != val) {
|
||||
u16 = opal_hwloc_compute_relative_locality(opal_process_info.cpuset, val);
|
||||
u16 = opal_hwloc_compute_relative_locality(opal_process_info.locality, val);
|
||||
free(val);
|
||||
} else {
|
||||
/* all we can say is that it shares our node */
|
||||
@ -826,9 +835,6 @@ int ompi_rte_init(int *pargc, char ***pargv)
|
||||
ret = opal_pmix_convert_status(rc);
|
||||
error = "local store of locality";
|
||||
opal_argv_free(peers);
|
||||
if (NULL != opal_process_info.cpuset) {
|
||||
free(opal_process_info.cpuset);
|
||||
}
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
@ -345,7 +345,7 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
|
||||
}
|
||||
|
||||
// compute relative locality
|
||||
relative_locality = opal_hwloc_compute_relative_locality(process_info->cpuset, locality_string);
|
||||
relative_locality = opal_hwloc_compute_relative_locality(process_info->locality, locality_string);
|
||||
free(locality_string);
|
||||
|
||||
if (relative_locality & OPAL_PROC_ON_SOCKET) {
|
||||
|
@ -41,6 +41,7 @@ opal_process_info_t opal_process_info = {
|
||||
.my_local_rank = 0, /* I'm the only process around here */
|
||||
.my_node_rank = 0,
|
||||
.cpuset = NULL,
|
||||
.locality = NULL,
|
||||
.pid = 0,
|
||||
.num_procs = 0,
|
||||
.app_num = 0,
|
||||
|
@ -115,6 +115,7 @@ typedef struct opal_process_info_t {
|
||||
uint16_t my_local_rank; /**< local rank on this node within my job */
|
||||
uint16_t my_node_rank;
|
||||
char *cpuset; /**< String-representation of bitmap where we are bound */
|
||||
char *locality; /**< String-representation of process locality */
|
||||
pid_t pid;
|
||||
uint32_t num_procs;
|
||||
uint32_t app_num;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user