Correct computation of relative locality
Ensure we always pass the cpuset as well as the locality string for each proc. Correct the mtl/ofi component's computation of relative locality: the function being called expects the locality string of each proc, not the cpuset. If the locality string of the current proc isn't available, fall back to the cpuset (if available) and compute the locality string from it before computing the relative localities of our peers.
Signed-off-by: Ralph Castain <rhc@pmix.org>
This commit is contained in:
parent
3f863aab8a
commit
ec3589389a
1
.gitignore
vendored
1
.gitignore
vendored
@ -396,6 +396,7 @@ opal/mca/pmix/pmix*/pmix/examples/jctrl
|
||||
opal/mca/pmix/pmix*/pmix/examples/pub
|
||||
opal/mca/pmix/pmix*/pmix/examples/server
|
||||
opal/mca/pmix/pmix*/pmix/examples/tool
|
||||
opal/mca/pmix/pmix*/pmix/maint/pmix.pc
|
||||
|
||||
opal/mca/pmix/ext3x/ext3x.c
|
||||
opal/mca/pmix/ext3x/ext3x.h
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2020 Triad National Security, LLC. All rights
|
||||
@ -308,6 +308,7 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank
|
||||
char **peers = NULL;
|
||||
char *local_peers = NULL;
|
||||
char *locality_string = NULL;
|
||||
char *mylocality = NULL;
|
||||
|
||||
pname.jobid = OPAL_PROC_MY_NAME.jobid;
|
||||
pname.vpid = OPAL_VPID_WILDCARD;
|
||||
@ -333,6 +334,20 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank
|
||||
peers = opal_argv_split(local_peers, ',');
|
||||
free(local_peers);
|
||||
|
||||
// Get my locality
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
|
||||
&OPAL_PROC_MY_NAME, &mylocality, OPAL_STRING);
|
||||
if (OPAL_SUCCESS != rc || NULL == mylocality) {
|
||||
// can we fall back to cpuset?
|
||||
if (NULL != cpuset && NULL != opal_hwloc_topology) {
|
||||
mylocality = opal_hwloc_base_get_locality_string(opal_hwloc_topology, cpuset);
|
||||
} else {
|
||||
// We can't find package_rank, fall back to procid
|
||||
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
|
||||
return pid;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; NULL != peers[i]; i++) {
|
||||
pname.vpid = strtoul(peers[i], NULL, 10);
|
||||
locality_string = NULL;
|
||||
@ -346,7 +361,7 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank
|
||||
}
|
||||
|
||||
// compute relative locality
|
||||
relative_locality = opal_hwloc_compute_relative_locality(cpuset, locality_string);
|
||||
relative_locality = opal_hwloc_compute_relative_locality(mylocality, locality_string);
|
||||
free(locality_string);
|
||||
|
||||
if (relative_locality & OPAL_PROC_ON_SOCKET) {
|
||||
@ -354,6 +369,7 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank
|
||||
current_package_rank++;
|
||||
}
|
||||
}
|
||||
free(mylocality);
|
||||
|
||||
return (uint32_t)package_ranks[my_local_rank];
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -54,6 +54,7 @@ int orte_ess_base_proc_binding(void)
|
||||
int ret;
|
||||
char *error=NULL;
|
||||
hwloc_cpuset_t mycpus;
|
||||
opal_value_t val;
|
||||
|
||||
/* Determine if we were pre-bound or not - this also indicates
|
||||
* that we were launched via mpirun, bound or not */
|
||||
@ -66,23 +67,39 @@ int orte_ess_base_proc_binding(void)
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
|
||||
/* print out a shorthand notation to avoid pulling in the entire topology tree */
|
||||
map = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
|
||||
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != map) {
|
||||
/* get our cpuset */
|
||||
if (NULL != orte_process_info.cpuset) {
|
||||
free(orte_process_info.cpuset);
|
||||
orte_process_info.cpuset = NULL;
|
||||
}
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_CPUSET,
|
||||
ORTE_PROC_MY_NAME, &orte_process_info.cpuset, OPAL_STRING);
|
||||
/* try to get our locality as well */
|
||||
map = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
|
||||
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != map) {
|
||||
/* we were - no need to pull in the topology */
|
||||
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
|
||||
opal_output(0, "MCW rank %s bound to %s",
|
||||
ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
|
||||
free(map);
|
||||
} else {
|
||||
opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid));
|
||||
}
|
||||
free(map);
|
||||
} else {
|
||||
opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid));
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
} else if (NULL != getenv(OPAL_MCA_PREFIX"orte_externally_bound")) {
|
||||
orte_proc_is_bound = true;
|
||||
/* see if we were launched by a PMIx-enabled system */
|
||||
/* get our cpuset, if available */
|
||||
if (NULL != orte_process_info.cpuset) {
|
||||
free(orte_process_info.cpuset);
|
||||
orte_process_info.cpuset = NULL;
|
||||
}
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_CPUSET,
|
||||
ORTE_PROC_MY_NAME, &orte_process_info.cpuset, OPAL_STRING);
|
||||
|
||||
/* see if we also have our locality - this is the one we require */
|
||||
map = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
|
||||
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
|
||||
@ -323,6 +340,17 @@ int orte_ess_base_proc_binding(void)
|
||||
if (NULL != orte_process_info.cpuset) {
|
||||
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_CPUSET,
|
||||
orte_process_info.cpuset, OPAL_STRING);
|
||||
/* save our locality string so we can retrieve it elsewhere */
|
||||
OBJ_CONSTRUCT(&val, opal_value_t);
|
||||
val.key = OPAL_PMIX_LOCALITY_STRING;
|
||||
val.type = OPAL_STRING;
|
||||
val.data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, orte_process_info.cpuset);
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
val.key = NULL;
|
||||
val.data.string = NULL;
|
||||
OBJ_DESTRUCT(&val);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2009-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Mellanox Technologies, Inc.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Research Organization for Information Science
|
||||
@ -386,7 +386,12 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata, bool force)
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp);
|
||||
opal_list_append(pmap, &kv->super);
|
||||
free(tmp);
|
||||
/* pass the cpuset itself as well */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_CPUSET);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = tmp;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
} else {
|
||||
/* the proc is not bound */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
|
Loading…
Reference in New Issue
Block a user