From ec3589389a2e33731eac7c576be0dbc44bcc6469 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 10 Nov 2020 00:12:22 -0800 Subject: [PATCH] Correct computation of relative locality Ensure we always pass the cpuset as well as the locality string for each proc. Correct the mtl/ofi component's computation of relative locality as the function being called expects to be given the locality string of each proc, not the cpuset. If the locality string of the current proc isn't available, then use the cpuset if available and compute the locality before trying to compute relative localities of our peers. Signed-off-by: Ralph Castain --- .gitignore | 1 + opal/mca/common/ofi/common_ofi.c | 20 ++++++++- orte/mca/ess/base/ess_base_fns.c | 50 +++++++++++++++++----- orte/orted/pmix/pmix_server_register_fns.c | 9 +++- 4 files changed, 65 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index ff8e303f9d..baf4cafb2a 100644 --- a/.gitignore +++ b/.gitignore @@ -396,6 +396,7 @@ opal/mca/pmix/pmix*/pmix/examples/jctrl opal/mca/pmix/pmix*/pmix/examples/pub opal/mca/pmix/pmix*/pmix/examples/server opal/mca/pmix/pmix*/pmix/examples/tool +opal/mca/pmix/pmix*/pmix/maint/pmix.pc opal/mca/pmix/ext3x/ext3x.c opal/mca/pmix/ext3x/ext3x.h diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 9b4631856c..593d6e5135 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2020 Triad National Security, LLC. All rights @@ -308,6 +308,7 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank char **peers = NULL; char *local_peers = NULL; char *locality_string = NULL; + char *mylocality = NULL; pname.jobid = OPAL_PROC_MY_NAME.jobid; pname.vpid = OPAL_VPID_WILDCARD; @@ -333,6 +334,20 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank peers = opal_argv_split(local_peers, ','); free(local_peers); + // Get my locality + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING, + &OPAL_PROC_MY_NAME, &mylocality, OPAL_STRING); + if (OPAL_SUCCESS != rc || NULL == mylocality) { + // can we fall back to cpuset? + if (NULL != cpuset && NULL != opal_hwloc_topology) { + mylocality = opal_hwloc_base_get_locality_string(opal_hwloc_topology, cpuset); + } else { + // We can't find package_rank, fall back to procid + opal_show_help("help-common-ofi.txt", "package_rank failed", true); + return pid; + } + } + for (i = 0; NULL != peers[i]; i++) { pname.vpid = strtoul(peers[i], NULL, 10); locality_string = NULL; @@ -346,7 +361,7 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank } // compute relative locality - relative_locality = opal_hwloc_compute_relative_locality(cpuset, locality_string); + relative_locality = opal_hwloc_compute_relative_locality(mylocality, locality_string); free(locality_string); if (relative_locality & OPAL_PROC_ON_SOCKET) { @@ -354,6 +369,7 @@ static uint32_t get_package_rank(int32_t num_local_peers, uint16_t my_local_rank current_package_rank++; } } + free(mylocality); return (uint32_t)package_ranks[my_local_rank]; } diff --git a/orte/mca/ess/base/ess_base_fns.c b/orte/mca/ess/base/ess_base_fns.c index 9b57519e80..fc97a4b499 100644 --- a/orte/mca/ess/base/ess_base_fns.c +++ b/orte/mca/ess/base/ess_base_fns.c @@ -12,7 +12,7 @@ * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -54,6 +54,7 @@ int orte_ess_base_proc_binding(void) int ret; char *error=NULL; hwloc_cpuset_t mycpus; + opal_value_t val; /* Determine if we were pre-bound or not - this also indicates * that we were launched via mpirun, bound or not */ @@ -66,23 +67,39 @@ int orte_ess_base_proc_binding(void) goto error; } } - if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { - /* print out a shorthand notation to avoid pulling in the entire topology tree */ - map = NULL; - OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, - ORTE_PROC_MY_NAME, &map, OPAL_STRING); - if (OPAL_SUCCESS == ret && NULL != map) { + /* get our cpuset */ + if (NULL != orte_process_info.cpuset) { + free(orte_process_info.cpuset); + orte_process_info.cpuset = NULL; + } + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_CPUSET, + ORTE_PROC_MY_NAME, &orte_process_info.cpuset, OPAL_STRING); + /* try to get our locality as well */ + map = NULL; + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, + ORTE_PROC_MY_NAME, &map, OPAL_STRING); + if (OPAL_SUCCESS == ret && NULL != map) { + /* we were - no need to pull in the topology */ + if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { opal_output(0, "MCW rank %s bound to %s", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map); - free(map); - } else { - opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid)); } + free(map); + } else { + opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid)); } return ORTE_SUCCESS; } else if (NULL != getenv(OPAL_MCA_PREFIX"orte_externally_bound")) { orte_proc_is_bound = true; - /* see if we were launched by a PMIx-enabled system */ + /* get our cpuset, if available */ + if (NULL != orte_process_info.cpuset) { + free(orte_process_info.cpuset); + orte_process_info.cpuset = NULL; + } + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_CPUSET, + ORTE_PROC_MY_NAME, &orte_process_info.cpuset, OPAL_STRING); + + /* see if we also have our locality - this is the one we require */ map = NULL; OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, ORTE_PROC_MY_NAME, &map, OPAL_STRING); @@ -323,6 +340,17 @@ int orte_ess_base_proc_binding(void) if (NULL != orte_process_info.cpuset) { OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_CPUSET, orte_process_info.cpuset, OPAL_STRING); + /* save our locality string so we can retrieve it elsewhere */ + OBJ_CONSTRUCT(&val, opal_value_t); + val.key = OPAL_PMIX_LOCALITY_STRING; + val.type = OPAL_STRING; + val.data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, orte_process_info.cpuset); + if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &val))) { + ORTE_ERROR_LOG(ret); + } + val.key = NULL; + val.data.string = NULL; + OBJ_DESTRUCT(&val); } return ORTE_SUCCESS; diff --git a/orte/orted/pmix/pmix_server_register_fns.c b/orte/orted/pmix/pmix_server_register_fns.c index 0a0a54d764..f61e1ff4f5 100644 --- a/orte/orted/pmix/pmix_server_register_fns.c +++ b/orte/orted/pmix/pmix_server_register_fns.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2018 Cisco Systems, Inc. All rights reserved * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2019 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science @@ -386,7 +386,12 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata, bool force) kv->type = OPAL_STRING; kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp); opal_list_append(pmap, &kv->super); - free(tmp); + /* pass the cpuset itself as well */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_CPUSET); + kv->type = OPAL_STRING; + kv->data.string = tmp; + opal_list_append(pmap, &kv->super); } else { /* the proc is not bound */ kv = OBJ_NEW(opal_value_t);