1
1

Merge pull request #8199 from rhc54/topic/locality

Fix confusion between cpuset and locality
Этот коммит содержится в:
Ralph Castain 2020-11-11 10:22:03 -08:00 коммит произвёл GitHub
родитель 57ccb830c2 2f7f1feca5
Коммит d489030925
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 18 добавлений и 23 удалений

Просмотреть файл

@@ -355,7 +355,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
if (0 < opal_list_get_size(&ilist)) { if (0 < opal_list_get_size(&ilist)) {
uint32_t *peer_ranks = NULL; uint32_t *peer_ranks = NULL;
int prn, nprn = 0; int prn, nprn = 0;
char *val, *mycpuset; char *val;
uint16_t u16; uint16_t u16;
opal_process_name_t wildcard_rank; opal_process_name_t wildcard_rank;
/* convert the list of new procs to a proc_t array */ /* convert the list of new procs to a proc_t array */
@@ -380,16 +380,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
opal_argv_free(peers); opal_argv_free(peers);
} }
/* get my locality string */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
OMPI_PROC_MY_NAME, &val, PMIX_STRING);
if (OPAL_SUCCESS == rc && NULL != val) {
mycpuset = val;
} else {
mycpuset = NULL;
}
i = 0; i = 0;
OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) { OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
proc = cd->p; proc = cd->p;
@@ -406,8 +396,8 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
val = NULL; val = NULL;
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING, OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
&proc->super.proc_name, &val, OPAL_STRING); &proc->super.proc_name, &val, OPAL_STRING);
if (OPAL_SUCCESS == rc && NULL != val) { if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
u16 = opal_hwloc_compute_relative_locality(mycpuset, val); u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
free(val); free(val);
} else { } else {
/* all we can say is that it shares our node */ /* all we can say is that it shares our node */
@@ -425,9 +415,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
} }
++i; ++i;
} }
if (NULL != mycpuset) {
free(mycpuset);
}
if (NULL != peer_ranks) { if (NULL != peer_ranks) {
free(peer_ranks); free(peer_ranks);
} }

Просмотреть файл

@@ -764,7 +764,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
/* identify our location */ /* identify our location */
val = NULL; val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_CPUSET,
&opal_process_info.my_name, &val, PMIX_STRING); &opal_process_info.my_name, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) { if (PMIX_SUCCESS == rc && NULL != val) {
opal_process_info.cpuset = val; opal_process_info.cpuset = val;
@@ -774,6 +774,15 @@ int ompi_rte_init(int *pargc, char ***pargv)
opal_process_info.cpuset = NULL; opal_process_info.cpuset = NULL;
opal_process_info.proc_is_bound = false; opal_process_info.proc_is_bound = false;
} }
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
&opal_process_info.my_name, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) {
opal_process_info.locality = val;
val = NULL; // protect the string
} else {
opal_process_info.locality = NULL;
}
/* retrieve the local peers - defaults to local node */ /* retrieve the local peers - defaults to local node */
val = NULL; val = NULL;
@@ -811,7 +820,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
&pname, &val, PMIX_STRING); &pname, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) { if (PMIX_SUCCESS == rc && NULL != val) {
u16 = opal_hwloc_compute_relative_locality(opal_process_info.cpuset, val); u16 = opal_hwloc_compute_relative_locality(opal_process_info.locality, val);
free(val); free(val);
} else { } else {
/* all we can say is that it shares our node */ /* all we can say is that it shares our node */
@@ -826,9 +835,6 @@ int ompi_rte_init(int *pargc, char ***pargv)
ret = opal_pmix_convert_status(rc); ret = opal_pmix_convert_status(rc);
error = "local store of locality"; error = "local store of locality";
opal_argv_free(peers); opal_argv_free(peers);
if (NULL != opal_process_info.cpuset) {
free(opal_process_info.cpuset);
}
goto error; goto error;
} }
} }

Просмотреть файл

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2015 Intel, Inc. All rights reserved. * Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights * Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2020 Triad National Security, LLC. All rights * Copyright (c) 2020 Triad National Security, LLC. All rights
@@ -345,7 +345,7 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
} }
// compute relative locality // compute relative locality
relative_locality = opal_hwloc_compute_relative_locality(process_info->cpuset, locality_string); relative_locality = opal_hwloc_compute_relative_locality(process_info->locality, locality_string);
free(locality_string); free(locality_string);
if (relative_locality & OPAL_PROC_ON_SOCKET) { if (relative_locality & OPAL_PROC_ON_SOCKET) {

Просмотреть файл

@@ -41,6 +41,7 @@ opal_process_info_t opal_process_info = {
.my_local_rank = 0, /* I'm the only process around here */ .my_local_rank = 0, /* I'm the only process around here */
.my_node_rank = 0, .my_node_rank = 0,
.cpuset = NULL, .cpuset = NULL,
.locality = NULL,
.pid = 0, .pid = 0,
.num_procs = 0, .num_procs = 0,
.app_num = 0, .app_num = 0,

Просмотреть файл

@@ -115,6 +115,7 @@ typedef struct opal_process_info_t {
uint16_t my_local_rank; /**< local rank on this node within my job */ uint16_t my_local_rank; /**< local rank on this node within my job */
uint16_t my_node_rank; uint16_t my_node_rank;
char *cpuset; /**< String-representation of bitmap where we are bound */ char *cpuset; /**< String-representation of bitmap where we are bound */
char *locality; /**< String-representation of process locality */
pid_t pid; pid_t pid;
uint32_t num_procs; uint32_t num_procs;
uint32_t app_num; uint32_t app_num;