1
1

Merge pull request #2648 from rhc54/topic/topo

Only instantiate the HWLOC topology in an MPI process if it actually will be used.
Этот коммит содержится в:
rhc54 2016-12-29 11:52:08 -08:00 коммит произвёл GitHub
родитель 52533f755e fe68f23099
Коммит a16162832b
14 изменённых файлов: 309 добавлений и 197 удалений

Просмотреть файл

@ -15,6 +15,7 @@
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -51,7 +52,7 @@ static void out(char *str, char *arg);
void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
int *error_code, ...)
int *error_code, ...)
{
char *name;
struct ompi_communicator_t *abort_comm;
@ -72,7 +73,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
int *error_code, ...)
int *error_code, ...)
{
char *name;
struct ompi_communicator_t *abort_comm;
@ -93,7 +94,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
int *error_code, ...)
int *error_code, ...)
{
char *name;
struct ompi_communicator_t *abort_comm = NULL;
@ -111,7 +112,7 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
}
void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
int *error_code, ...)
int *error_code, ...)
{
/* Don't need anything more -- just need this function to exist */
/* Silence some compiler warnings */
@ -123,7 +124,7 @@ void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
int *error_code, ...)
int *error_code, ...)
{
/* Don't need anything more -- just need this function to exist */
/* Silence some compiler warnings */
@ -135,7 +136,7 @@ void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
int *error_code, ...)
int *error_code, ...)
{
/* Don't need anything more -- just need this function to exist */
/* Silence some compiler warnings */
@ -181,6 +182,7 @@ static void backend_fatal_aggregate(char *type,
const char* const unknown_error_code = "Error code: %d (no associated error message)";
const char* const unknown_error = "Unknown error";
const char* const unknown_prefix = "[?:?]";
bool generated = false;
// these do not own what they point to; they're
// here to avoid repeating expressions such as
@ -209,6 +211,8 @@ static void backend_fatal_aggregate(char *type,
err_msg = NULL;
opal_output(0, "%s", "Could not write to err_msg");
opal_output(0, unknown_error_code, *error_code);
} else {
generated = true;
}
}
}
@ -254,7 +258,9 @@ static void backend_fatal_aggregate(char *type,
}
free(prefix);
free(err_msg);
if (generated) {
free(err_msg);
}
}
/*

Просмотреть файл

@ -4,6 +4,7 @@
* reserved.
* Copyright (c) 2011-2015 INRIA. All rights reserved.
* Copyright (c) 2011-2015 Université Bordeaux 1
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -61,9 +62,6 @@ mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component =
static int init_query(bool enable_progress_threads, bool enable_mpi_threads)
{
if(NULL == opal_hwloc_topology) {
return OPAL_ERR_NOT_SUPPORTED;
}
return OMPI_SUCCESS;
}
@ -97,4 +95,3 @@ static int mca_topo_treematch_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY, &mca_topo_treematch_component.reorder_mode);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -5,7 +5,7 @@
* reserved.
* Copyright (c) 2011-2015 INRIA. All rights reserved.
* Copyright (c) 2012-2015 Bordeaux Poytechnic Institute
* Copyright (c) 2015 Intel, Inc. All rights reserved
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@ -256,7 +256,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
/* Then, we need to know if the processes are bound */
/* We make the hypothesis that all processes are in */
/* the same state : all bound or none bound */
assert(NULL != opal_hwloc_topology);
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
goto fallback;
}
root_obj = hwloc_get_root_obj(opal_hwloc_topology);
if (NULL == root_obj) goto fallback;
@ -873,7 +875,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
if( -1 == hwloc_err) goto fallback;
/* Report new binding to ORTE/OPAL */
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
err = hwloc_bitmap_snprintf (set_as_string,64,set);
#ifdef __DEBUG__

Просмотреть файл

@ -508,16 +508,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* check for timing request - get stop time and report elapsed time if so */
OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));
/* if hwloc is available but didn't get setup for some
* reason, do so now
*/
if (NULL == opal_hwloc_topology) {
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "Topology init";
goto error;
}
}
/* Register the default errhandler callback */
errtrk.status = OPAL_ERROR;
errtrk.active = true;

Просмотреть файл

@ -18,7 +18,7 @@
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014 Bull SAS. All rights reserved.
@ -1502,13 +1502,33 @@ static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
static uint64_t calculate_total_mem (void)
{
hwloc_obj_t machine;
int rc;
uint64_t mem, *mptr;
opal_process_name_t wildcard_rank;
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
if (NULL == machine) {
return 0;
/* first try to retrieve it from PMIx as it may have
* been provided */
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
mptr = &mem;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
&wildcard_rank, &mptr, OPAL_UINT64);
if (OPAL_SUCCESS == rc) {
return mem;
}
return machine->memory.total_memory;
/* if not available, then ensure that the topology has been
* loaded and try to get it from there */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
if (NULL == machine) {
return 0;
}
return machine->memory.total_memory;
}
/* couldn't find it */
return 0;
}
@ -2312,7 +2332,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
float distance = 0;
/* Override any distance logic so all devices are used */
if (0 != mca_btl_openib_component.ignore_locality) {
if (0 != mca_btl_openib_component.ignore_locality ||
OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
return distance;
}

Просмотреть файл

@ -52,7 +52,7 @@
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
@ -242,7 +242,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
free(loc);
} else {
/* If we have hwloc support, then get accurate information */
if (NULL != opal_hwloc_topology) {
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_AVAILABLE);
@ -257,6 +257,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
}
}
/* see if we were given our location */
loc = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
if (OPAL_SUCCESS == rc) {
@ -283,8 +284,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
}
} else {
/* If we have hwloc support, then get accurate information */
if (NULL != opal_hwloc_topology && num_mem_nodes > 0 &&
NULL != opal_process_info.cpuset) {
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0) {
int numa=0, w;
unsigned n_bound=0;
hwloc_cpuset_t avail;

Просмотреть файл

@ -18,7 +18,7 @@
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -48,6 +48,7 @@
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
#include "opal/datatype/opal_convertor.h"
@ -232,23 +233,28 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
int my_mem_node, num_mem_nodes, i, rc;
mca_common_sm_mpool_resources_t *res = NULL;
mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;
char *loc, *mynuma;
opal_process_name_t wildcard_rank;
/* Assume we don't have hwloc support and fill in dummy info */
mca_btl_smcuda_component.mem_node = my_mem_node = 0;
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;
/* If we have hwloc support, then get accurate information */
if (NULL != opal_hwloc_topology) {
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_AVAILABLE);
/* If we find >0 NUMA nodes, then investigate further */
if (i > 0) {
int numa=0, w;
unsigned n_bound=0;
hwloc_cpuset_t avail;
hwloc_obj_t obj;
/* see if we were given a topology signature */
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_TOPOLOGY_SIGNATURE,
&wildcard_rank, &loc, OPAL_STRING);
if (OPAL_SUCCESS == rc) {
/* the number of NUMA nodes is right at the front */
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10);
free(loc);
} else {
/* If we have hwloc support, then get accurate information */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_AVAILABLE);
/* JMS This tells me how many numa nodes are *available*,
but it's not how many are being used *by this job*.
@ -257,33 +263,65 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
should be improved to be how many NUMA nodes are being
used *in this job*. */
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
}
}
/* see if we were given our location */
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
if (OPAL_SUCCESS == rc) {
if (NULL == loc) {
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
} else {
/* get our NUMA location */
mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0);
if (NULL == mynuma ||
NULL != strchr(mynuma, ',') ||
NULL != strchr(mynuma, '-')) {
/* we either have no idea what NUMA we are on, or we
* are on multiple NUMA nodes */
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
} else {
/* we are bound to a single NUMA node */
my_mem_node = strtoul(mynuma, NULL, 10);
mca_btl_smcuda_component.mem_node = my_mem_node;
}
if (NULL != mynuma) {
free(mynuma);
}
free(loc);
}
} else {
/* If we have hwloc support, then get accurate information */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() &&
num_mem_nodes > 0 && NULL != opal_process_info.cpuset) {
int numa=0, w;
unsigned n_bound=0;
hwloc_cpuset_t avail;
hwloc_obj_t obj;
/* if we are not bound, then there is nothing further to do */
if (NULL != opal_process_info.cpuset) {
/* count the number of NUMA nodes to which we are bound */
for (w=0; w < i; w++) {
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0, w,
OPAL_HWLOC_AVAILABLE))) {
continue;
}
/* get that NUMA node's available cpus */
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
/* see if we intersect */
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
n_bound++;
numa = w;
}
/* count the number of NUMA nodes to which we are bound */
for (w=0; w < i; w++) {
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0, w,
OPAL_HWLOC_AVAILABLE))) {
continue;
}
/* if we are located on more than one NUMA, or we didn't find
* a NUMA we are on, then not much we can do
*/
if (1 == n_bound) {
mca_btl_smcuda_component.mem_node = my_mem_node = numa;
} else {
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
/* get that NUMA node's available cpus */
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
/* see if we intersect */
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
n_bound++;
numa = w;
}
}
/* if we are located on more than one NUMA, or we didn't find
* a NUMA we are on, then not much we can do
*/
if (1 == n_bound) {
mca_btl_smcuda_component.mem_node = my_mem_node = numa;
} else {
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
}
}
}
@ -431,7 +469,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
mca_btl_smcuda_component.sm_free_list_inc,
mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
if ( OPAL_SUCCESS != i )
return i;
return i;
mca_btl_smcuda_component.num_outstanding_frags = 0;
@ -1120,8 +1158,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
mca_common_wait_stream_synchronize(&rget_reg);
rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size,
"mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
&done);
"mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
&done);
if (OPAL_SUCCESS != rc) {
/* Out of resources can be handled by upper layers. */
if (OPAL_ERR_OUT_OF_RESOURCE != rc) {

Просмотреть файл

@ -1,5 +1,6 @@
/*
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -9,7 +10,7 @@
#include "opal_config.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/constants.h"
#if BTL_IN_OPAL
@ -191,6 +192,13 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
opal_output_verbose(5, USNIC_OUT,
"btl:usnic:filter_numa: filtering devices by NUMA distance");
/* ensure we have the topology */
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
opal_output_verbose(5, USNIC_OUT,
"btl:usnic:filter_numa: not sorting devices by NUMA distance (topology not available)");
return OPAL_SUCCESS;
}
/* Get the hwloc distance matrix for all NUMA nodes */
if (OPAL_SUCCESS != (ret = get_distance_matrix())) {
return ret;

Просмотреть файл

@ -40,6 +40,7 @@
#include "opal/util/os_dirpath.h"
#include "opal/util/show_help.h"
#include "opal/threads/tsd.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"
@ -240,12 +241,65 @@ static void fill_cache_line_size(void)
int opal_hwloc_base_get_topology(void)
{
int rc=OPAL_SUCCESS;
int rc;
opal_process_name_t wildcard_rank;
char *val = NULL;
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
OPAL_OUTPUT_VERBOSE((2, opal_hwloc_base_framework.framework_output,
"hwloc:base:get_topology"));
if (NULL == opal_hwloc_base_topo_file) {
/* see if we already got it */
if (NULL != opal_hwloc_topology) {
return OPAL_SUCCESS;
}
if (NULL != opal_pmix.get) {
/* try to retrieve it from the PMIx store */
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base instantiating topology");
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_TOPO,
&wildcard_rank, &val, OPAL_STRING);
} else {
rc = OPAL_ERR_NOT_SUPPORTED;
}
if (OPAL_SUCCESS == rc && NULL != val) {
/* load the topology */
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
free(val);
return OPAL_ERROR;
}
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
free(val);
hwloc_topology_destroy(opal_hwloc_topology);
return OPAL_ERROR;
}
/* since we are loading this from an external source, we have to
* explicitly set a flag so hwloc sets things up correctly
*/
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
return OPAL_ERROR;
}
/* now load the topology */
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
return OPAL_ERROR;
}
free(val);
/* filter the cpus thru any default cpu set */
if (OPAL_SUCCESS != (rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
hwloc_topology_destroy(opal_hwloc_topology);
return rc;
}
} else if (NULL == opal_hwloc_base_topo_file) {
if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
0 != hwloc_topology_set_flags(opal_hwloc_topology,
(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
@ -266,7 +320,12 @@ int opal_hwloc_base_get_topology(void)
line size */
fill_cache_line_size();
return rc;
/* get or update our local cpuset - it will get used multiple
* times, so it's more efficient to keep a global copy
*/
opal_hwloc_base_get_local_cpuset();
return OPAL_SUCCESS;
}
int opal_hwloc_base_set_topology(char *topofile)

Просмотреть файл

@ -106,6 +106,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace
#define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -56,7 +56,8 @@ int orte_ess_base_proc_binding(void)
char *error=NULL;
hwloc_cpuset_t mycpus;
/* Determine if we were pre-bound or not */
/* Determine if we were pre-bound or not - this also indicates
* that we were launched via mpirun, bound or not */
if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) {
orte_proc_is_bound = true;
if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) {
@ -66,21 +67,49 @@ int orte_ess_base_proc_binding(void)
goto error;
}
}
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
/* print out a shorthand notation to avoid pulling in the entire topology tree */
map = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != map) {
opal_output(0, "MCW rank %s bound to %s",
ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
free(map);
} else {
opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid));
}
}
return ORTE_SUCCESS;
} else if (NULL != getenv(OPAL_MCA_PREFIX"orte_externally_bound")) {
orte_proc_is_bound = true;
/* see if we were launched by a PMIx-enabled system */
map = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != map) {
/* we were - no need to pull in the topology */
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
opal_output(0, "MCW rank %s bound to %s",
ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
}
free(map);
return ORTE_SUCCESS;
}
/* the topology system will pickup the binding pattern */
}
/* load the topology as we will likely need it */
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
/* there is nothing we can do, so just return */
return ORTE_SUCCESS;
}
/* see if we were bound when launched */
if (!orte_proc_is_bound) {
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
"%s Not bound at launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* we were not bound at launch */
if (NULL == opal_hwloc_topology) {
/* there is nothing we can do, so just return */
return ORTE_SUCCESS;
}
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
/* get our node object */
node = hwloc_get_root_obj(opal_hwloc_topology);
@ -257,11 +286,6 @@ int orte_ess_base_proc_binding(void)
}
MOVEON:
/* get or update our local cpuset - it will get used multiple
* times, so it's more efficient to keep a global copy
*/
opal_hwloc_base_get_local_cpuset();
/* get the cpus we are bound to */
mycpus = hwloc_bitmap_alloc();
if (hwloc_get_cpubind(opal_hwloc_topology,

Просмотреть файл

@ -302,75 +302,6 @@ static int rte_init(void)
}
}
/* retrieve our topology */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
/* load the topology */
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
ret = OPAL_ERROR;
free(val);
error = "setting topology";
goto error;
}
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
ret = OPAL_ERROR;
free(val);
hwloc_topology_destroy(opal_hwloc_topology);
error = "setting topology";
goto error;
}
/* since we are loading this from an external source, we have to
* explicitly set a flag so hwloc sets things up correctly
*/
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
ret = OPAL_ERROR;
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
error = "setting topology";
goto error;
}
/* now load the topology */
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
ret = OPAL_ERROR;
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
error = "setting topology";
goto error;
}
free(val);
/* filter the cpus thru any default cpu set */
if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
error = "filtering topology";
goto error;
}
} else {
/* it wasn't passed down to us, so go get it */
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "topology discovery";
goto error;
}
/* push it into the PMIx database in case someone
* tries to retrieve it so we avoid an attempt to
* get it again */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
kv->type = OPAL_STRING;
if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) {
error = "topology export";
goto error;
}
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&wildcard_rank, kv))) {
error = "topology store";
goto error;
}
OBJ_RELEASE(kv);
}
/* get our local peers */
if (0 < orte_process_info.num_local_peers) {
/* if my local rank if too high, then that's an error */

Просмотреть файл

@ -190,6 +190,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
/* if the user specified a default binding policy via
* MCA param, then we use it - this can include a directive
* to overload */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding policy given", __LINE__);
jdata->map->binding = opal_hwloc_binding_policy;
} else if (1 < jdata->map->cpus_per_rank) {
/* bind to cpus */
@ -238,6 +240,26 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using bynuma", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
} else {
/* we are mapping by node or some other non-object method */
if (nprocs <= 2) {
if (opal_hwloc_use_hwthreads_as_cpus) {
/* if we are using hwthread cpus, then bind to those */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using byhwthread", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_HWTHREAD);
} else {
/* for performance, bind to core */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using bycore", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CORE);
}
} else {
/* for performance, bind to NUMA */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using bynuma", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
}
}
} else if (nprocs <= 2) {
if (opal_hwloc_use_hwthreads_as_cpus) {

Просмотреть файл

@ -67,6 +67,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
uid_t uid;
gid_t gid;
opal_list_t *cache;
hwloc_obj_t machine;
opal_output_verbose(2, orte_pmix_server_globals.output,
"%s register nspace for %s",
@ -247,6 +248,16 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.string = strdup(orte_topo_signature);
opal_list_append(info, &kv->super);
/* total available physical memory */
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
if (NULL != machine) {
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_AVAIL_PHYS_MEMORY);
kv->type = OPAL_UINT64;
kv->data.uint64 = machine->memory.total_memory;
opal_list_append(info, &kv->super);
}
/* register any local clients */
vpid = ORTE_VPID_MAX;
micro = NULL;
@ -328,44 +339,53 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp);
opal_list_append(pmap, &kv->super);
free(tmp);
} else {
/* the proc is not bound */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALITY_STRING);
kv->type = OPAL_STRING;
kv->data.string = NULL;
opal_list_append(pmap, &kv->super);
}
}
/* appnum */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPNUM);
kv->type = OPAL_UINT32;
kv->data.uint32 = pptr->app_idx;
opal_list_append(pmap, &kv->super);
if (1 < jdata->num_apps) {
/* appnum */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPNUM);
kv->type = OPAL_UINT32;
kv->data.uint32 = pptr->app_idx;
opal_list_append(pmap, &kv->super);
/* app ldr */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = app->first_rank;
opal_list_append(pmap, &kv->super);
/* app ldr */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = app->first_rank;
opal_list_append(pmap, &kv->super);
/* global/univ rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->name.vpid + jdata->offset;
opal_list_append(pmap, &kv->super);
/* global/univ rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->name.vpid + jdata->offset;
opal_list_append(pmap, &kv->super);
/* app rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->app_rank;
opal_list_append(pmap, &kv->super);
/* app rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->app_rank;
opal_list_append(pmap, &kv->super);
/* app size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = app->num_procs;
opal_list_append(info, &kv->super);
/* app size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = app->num_procs;
opal_list_append(info, &kv->super);
}
/* local rank */
kv = OBJ_NEW(opal_value_t);
@ -381,13 +401,6 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.uint32 = pptr->node_rank;
opal_list_append(pmap, &kv->super);
/* hostname */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_HOSTNAME);
kv->type = OPAL_STRING;
kv->data.string = strdup(pptr->node->name);
opal_list_append(pmap, &kv->super);
/* node ID */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODEID);