
Only instantiate the HWLOC topology in an MPI process if it will actually be used.

There are only five places in the non-daemon code paths where opal_hwloc_topology is currently referenced:

* shared memory BTLs (sm, smcuda). I have added a code path to those components that uses the locality
  string instead of the topology itself, if available, thus avoiding instantiating the topology (see the
  sketch below)

* openib BTL. This uses the distance matrix. At present, I haven't developed a method
  for replacing that reference. Thus, this component will instantiate the topology

* usnic BTL. Also uses the distance matrix, so it too will instantiate the topology (it skips the
  NUMA-distance filtering when no topology is available)

* treematch TOPO component. Runs a complex tree-based placement algorithm, so it will instantiate
  the topology

* ess base functions. If a process is direct-launched and not bound at launch, this
  code attempts to bind it. Thus, procs in this scenario will instantiate the
  topology

Note that instantiating the topology on complex chips such as KNL can consume
megabytes of memory.
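
All of these paths now funnel through opal_hwloc_base_get_topology(), which instantiates the topology
only on first use and returns immediately once it exists (see the diff below). Here is a minimal sketch
of that lazy-instantiation guard, assuming only the hwloc C API; my_topology and my_get_topology are
illustrative stand-ins, not the actual OMPI symbols:

    #include <hwloc.h>
    #include <stddef.h>

    /* stand-in for the global opal_hwloc_topology */
    static hwloc_topology_t my_topology = NULL;

    /* lazy accessor: pays the hwloc discovery cost only once, on first call */
    static int my_get_topology(void)
    {
        if (NULL != my_topology) {
            return 0;   /* already instantiated - nothing more to do */
        }
        /* the real code first asks PMIx for an XML copy of the topology
         * (OPAL_PMIX_LOCAL_TOPO) and falls back to local discovery only
         * if that fails; this sketch shows just the discovery leg */
        if (0 != hwloc_topology_init(&my_topology)) {
            return -1;
        }
        if (0 != hwloc_topology_load(my_topology)) {
            hwloc_topology_destroy(my_topology);
            my_topology = NULL;
            return -1;
        }
        return 0;
    }

Callers treat a failure as "no topology available" and fall back gracefully, as the sm BTL does below.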

Fix pernode binding policy

Properly handle the unbound case

Correct pointer usage

Do not free static error messages!
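
On that last point: in backend_fatal_aggregate(), err_msg can point either at a static string or at a
heap-allocated buffer, so the fix records which case applies before freeing (see the first diff below).
A self-contained sketch of the ownership-flag pattern, with illustrative names rather than the exact
OMPI code:

    #define _GNU_SOURCE   /* for asprintf() on glibc */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>

    static void report(int code, const char *detail)
    {
        const char *static_msg = "Unknown error";
        char *err_msg = (char *)static_msg;
        bool generated = false;   /* true only when err_msg is ours to free */

        if (NULL != detail &&
            -1 != asprintf(&err_msg, "Error %d: %s", code, detail)) {
            generated = true;     /* heap buffer: we must free it */
        } else {
            err_msg = (char *)static_msg;  /* fall back to the static string */
        }

        fprintf(stderr, "%s\n", err_msg);
        if (generated) {          /* never free the static message */
            free(err_msg);
        }
    }

    int main(void)
    {
        report(3, "resources exhausted");  /* heap path: freed */
        report(7, NULL);                   /* static path: must not be freed */
        return 0;
    }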

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
This commit is contained in:
Ralph Castain 2016-12-29 07:31:35 -08:00
parent 52533f755e
commit fe68f23099
14 changed files: 309 additions and 197 deletions


@@ -15,6 +15,7 @@
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -51,7 +52,7 @@ static void out(char *str, char *arg);
void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
int *error_code, ...)
{
char *name;
struct ompi_communicator_t *abort_comm;
@@ -72,7 +73,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
int *error_code, ...)
{
char *name;
struct ompi_communicator_t *abort_comm;
@@ -93,7 +94,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
int *error_code, ...)
{
char *name;
struct ompi_communicator_t *abort_comm = NULL;
@@ -111,7 +112,7 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
}
void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
int *error_code, ...)
{
/* Don't need anything more -- just need this function to exist */
/* Silence some compiler warnings */
@@ -123,7 +124,7 @@ void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
int *error_code, ...)
{
/* Don't need anything more -- just need this function to exist */
/* Silence some compiler warnings */
@@ -135,7 +136,7 @@ void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
int *error_code, ...)
{
/* Don't need anything more -- just need this function to exist */
/* Silence some compiler warnings */
@@ -181,6 +182,7 @@ static void backend_fatal_aggregate(char *type,
const char* const unknown_error_code = "Error code: %d (no associated error message)";
const char* const unknown_error = "Unknown error";
const char* const unknown_prefix = "[?:?]";
bool generated = false;
// these do not own what they point to; they're
// here to avoid repeating expressions such as
@@ -209,6 +211,8 @@
err_msg = NULL;
opal_output(0, "%s", "Could not write to err_msg");
opal_output(0, unknown_error_code, *error_code);
} else {
generated = true;
}
}
}
@@ -254,7 +258,9 @@
}
free(prefix);
if (generated) {
free(err_msg);
}
}
/*


@@ -4,6 +4,7 @@
* reserved.
* Copyright (c) 2011-2015 INRIA. All rights reserved.
* Copyright (c) 2011-2015 Université Bordeaux 1
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -61,9 +62,6 @@ mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component =
static int init_query(bool enable_progress_threads, bool enable_mpi_threads)
{
if(NULL == opal_hwloc_topology) {
return OPAL_ERR_NOT_SUPPORTED;
}
return OMPI_SUCCESS;
}
@@ -97,4 +95,3 @@ static int mca_topo_treematch_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY, &mca_topo_treematch_component.reorder_mode);
return OMPI_SUCCESS;
}


@@ -5,7 +5,7 @@
* reserved.
* Copyright (c) 2011-2015 INRIA. All rights reserved.
* Copyright (c) 2012-2015 Bordeaux Polytechnic Institute
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@@ -256,7 +256,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
/* Then, we need to know if the processes are bound */
/* We make the hypothesis that all processes are in */
/* the same state : all bound or none bound */
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
goto fallback;
}
root_obj = hwloc_get_root_obj(opal_hwloc_topology);
if (NULL == root_obj) goto fallback;
@@ -873,7 +875,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
if( -1 == hwloc_err) goto fallback;
/* Report new binding to ORTE/OPAL */
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
err = hwloc_bitmap_snprintf (set_as_string,64,set);
#ifdef __DEBUG__


@@ -508,16 +508,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* check for timing request - get stop time and report elapsed time if so */
OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));
/* if hwloc is available but didn't get setup for some
* reason, do so now
*/
if (NULL == opal_hwloc_topology) {
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "Topology init";
goto error;
}
}
/* Register the default errhandler callback */
errtrk.status = OPAL_ERROR;
errtrk.active = true;


@@ -18,7 +18,7 @@
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014 Bull SAS. All rights reserved.
@@ -1502,13 +1502,33 @@ static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
static uint64_t calculate_total_mem (void)
{
hwloc_obj_t machine;
int rc;
uint64_t mem, *mptr;
opal_process_name_t wildcard_rank;
/* first try to retrieve it from PMIx as it may have
* been provided */
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
mptr = &mem;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
&wildcard_rank, &mptr, OPAL_UINT64);
if (OPAL_SUCCESS == rc) {
return mem;
}
/* if not available, then ensure that the topology has been
* loaded and try to get it from there */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
if (NULL == machine) {
return 0;
}
return machine->memory.total_memory;
}
/* couldn't find it */
return 0;
}
@@ -2312,7 +2332,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
float distance = 0;
/* Override any distance logic so all devices are used */
if (0 != mca_btl_openib_component.ignore_locality ||
OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
return distance;
}


@@ -52,7 +52,7 @@
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
@@ -242,7 +242,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
free(loc);
} else {
/* If we have hwloc support, then get accurate information */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_AVAILABLE);
@@ -257,6 +257,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
}
}
/* see if we were given our location */
loc = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
if (OPAL_SUCCESS == rc) {
@@ -283,8 +284,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
}
} else {
/* If we have hwloc support, then get accurate information */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0) {
int numa=0, w;
unsigned n_bound=0;
hwloc_cpuset_t avail;


@@ -18,7 +18,7 @@
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -48,6 +48,7 @@
#include "opal/util/show_help.h"
#include "opal/util/printf.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/mca/pmix/base/base.h"
#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"
#include "opal/datatype/opal_convertor.h"
@@ -232,23 +233,28 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
int my_mem_node, num_mem_nodes, i, rc;
mca_common_sm_mpool_resources_t *res = NULL;
mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;
char *loc, *mynuma;
opal_process_name_t wildcard_rank;
/* Assume we don't have hwloc support and fill in dummy info */
mca_btl_smcuda_component.mem_node = my_mem_node = 0;
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;
/* see if we were given a topology signature */
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_TOPOLOGY_SIGNATURE,
&wildcard_rank, &loc, OPAL_STRING);
if (OPAL_SUCCESS == rc) {
/* the number of NUMA nodes is right at the front */
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10);
free(loc);
} else {
/* If we have hwloc support, then get accurate information */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0,
OPAL_HWLOC_AVAILABLE);
/* JMS This tells me how many numa nodes are *available*,
but it's not how many are being used *by this job*.
@@ -257,33 +263,65 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
should be improved to be how many NUMA nodes are being
used *in this job*. */
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
}
}
/* see if we were given our location */
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
if (OPAL_SUCCESS == rc) {
if (NULL == loc) {
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
} else {
/* get our NUMA location */
mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0);
if (NULL == mynuma ||
NULL != strchr(mynuma, ',') ||
NULL != strchr(mynuma, '-')) {
/* we either have no idea what NUMA we are on, or we
* are on multiple NUMA nodes */
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
} else {
/* we are bound to a single NUMA node */
my_mem_node = strtoul(mynuma, NULL, 10);
mca_btl_smcuda_component.mem_node = my_mem_node;
}
if (NULL != mynuma) {
free(mynuma);
}
free(loc);
}
} else {
/* If we have hwloc support, then get accurate information */
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() &&
num_mem_nodes > 0 && NULL != opal_process_info.cpuset) {
int numa=0, w;
unsigned n_bound=0;
hwloc_cpuset_t avail;
hwloc_obj_t obj;
/* count the number of NUMA nodes to which we are bound */
for (w=0; w < i; w++) {
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
HWLOC_OBJ_NODE, 0, w,
OPAL_HWLOC_AVAILABLE))) {
continue;
}
/* get that NUMA node's available cpus */
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
/* see if we intersect */
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
n_bound++;
numa = w;
}
}
/* if we are located on more than one NUMA, or we didn't find
* a NUMA we are on, then not much we can do
*/
if (1 == n_bound) {
mca_btl_smcuda_component.mem_node = my_mem_node = numa;
} else {
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
}
}
}
@@ -431,7 +469,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
mca_btl_smcuda_component.sm_free_list_inc,
mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
if ( OPAL_SUCCESS != i )
return i;
mca_btl_smcuda_component.num_outstanding_frags = 0;
@@ -1120,8 +1158,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
mca_common_wait_stream_synchronize(&rget_reg);
rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size,
"mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
&done);
"mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
&done);
if (OPAL_SUCCESS != rc) {
/* Out of resources can be handled by upper layers. */
if (OPAL_ERR_OUT_OF_RESOURCE != rc) {


@@ -1,5 +1,6 @@
/*
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -9,7 +10,7 @@
#include "opal_config.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/constants.h"
#if BTL_IN_OPAL
@@ -191,6 +192,13 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
opal_output_verbose(5, USNIC_OUT,
"btl:usnic:filter_numa: filtering devices by NUMA distance");
/* ensure we have the topology */
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
opal_output_verbose(5, USNIC_OUT,
"btl:usnic:filter_numa: not sorting devices by NUMA distance (topology not available)");
return OPAL_SUCCESS;
}
/* Get the hwloc distance matrix for all NUMA nodes */
if (OPAL_SUCCESS != (ret = get_distance_matrix())) {
return ret;


@@ -40,6 +40,7 @@
#include "opal/util/os_dirpath.h"
#include "opal/util/show_help.h"
#include "opal/threads/tsd.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"
@@ -240,12 +241,65 @@ static void fill_cache_line_size(void)
int opal_hwloc_base_get_topology(void)
{
int rc;
opal_process_name_t wildcard_rank;
char *val = NULL;
OPAL_OUTPUT_VERBOSE((2, opal_hwloc_base_framework.framework_output,
"hwloc:base:get_topology"));
/* see if we already got it */
if (NULL != opal_hwloc_topology) {
return OPAL_SUCCESS;
}
if (NULL != opal_pmix.get) {
/* try to retrieve it from the PMIx store */
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
"hwloc:base instantiating topology");
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_TOPO,
&wildcard_rank, &val, OPAL_STRING);
} else {
rc = OPAL_ERR_NOT_SUPPORTED;
}
if (OPAL_SUCCESS == rc && NULL != val) {
/* load the topology */
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
free(val);
return OPAL_ERROR;
}
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
free(val);
hwloc_topology_destroy(opal_hwloc_topology);
return OPAL_ERROR;
}
/* since we are loading this from an external source, we have to
* explicitly set a flag so hwloc sets things up correctly
*/
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
return OPAL_ERROR;
}
/* now load the topology */
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
return OPAL_ERROR;
}
free(val);
/* filter the cpus thru any default cpu set */
if (OPAL_SUCCESS != (rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
hwloc_topology_destroy(opal_hwloc_topology);
return rc;
}
} else if (NULL == opal_hwloc_base_topo_file) {
if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
0 != hwloc_topology_set_flags(opal_hwloc_topology,
(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
@@ -266,7 +320,12 @@ int opal_hwloc_base_get_topology(void)
line size */
fill_cache_line_size();
/* get or update our local cpuset - it will get used multiple
* times, so it's more efficient to keep a global copy
*/
opal_hwloc_base_get_local_cpuset();
return OPAL_SUCCESS;
}
int opal_hwloc_base_set_topology(char *topofile)


@@ -106,6 +106,7 @@ BEGIN_C_DECLS
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace
#define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of


@@ -12,7 +12,7 @@
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@@ -56,7 +56,8 @@ int orte_ess_base_proc_binding(void)
char *error=NULL;
hwloc_cpuset_t mycpus;
/* Determine if we were pre-bound or not - this also indicates
* that we were launched via mpirun, bound or not */
if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) {
orte_proc_is_bound = true;
if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) {
@@ -66,21 +67,49 @@ int orte_ess_base_proc_binding(void)
goto error;
}
}
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
/* print out a shorthand notation to avoid pulling in the entire topology tree */
map = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != map) {
opal_output(0, "MCW rank %s bound to %s",
ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
free(map);
} else {
opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid));
}
}
return ORTE_SUCCESS;
} else if (NULL != getenv(OPAL_MCA_PREFIX"orte_externally_bound")) {
orte_proc_is_bound = true;
/* see if we were launched by a PMIx-enabled system */
map = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != map) {
/* we were - no need to pull in the topology */
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
opal_output(0, "MCW rank %s bound to %s",
ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
}
free(map);
return ORTE_SUCCESS;
}
/* the topology system will pick up the binding pattern */
}
/* load the topology as we will likely need it */
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
/* there is nothing we can do, so just return */
return ORTE_SUCCESS;
}
/* see if we were bound when launched */
if (!orte_proc_is_bound) {
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
"%s Not bound at launch",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
/* get our node object */
node = hwloc_get_root_obj(opal_hwloc_topology);
@@ -257,11 +286,6 @@ int orte_ess_base_proc_binding(void)
}
MOVEON:
/* get or update our local cpuset - it will get used multiple
* times, so it's more efficient to keep a global copy
*/
opal_hwloc_base_get_local_cpuset();
/* get the cpus we are bound to */
mycpus = hwloc_bitmap_alloc();
if (hwloc_get_cpubind(opal_hwloc_topology,


@@ -302,75 +302,6 @@ static int rte_init(void)
}
}
/* retrieve our topology */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
/* load the topology */
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
ret = OPAL_ERROR;
free(val);
error = "setting topology";
goto error;
}
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
ret = OPAL_ERROR;
free(val);
hwloc_topology_destroy(opal_hwloc_topology);
error = "setting topology";
goto error;
}
/* since we are loading this from an external source, we have to
* explicitly set a flag so hwloc sets things up correctly
*/
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
ret = OPAL_ERROR;
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
error = "setting topology";
goto error;
}
/* now load the topology */
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
ret = OPAL_ERROR;
hwloc_topology_destroy(opal_hwloc_topology);
free(val);
error = "setting topology";
goto error;
}
free(val);
/* filter the cpus thru any default cpu set */
if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
error = "filtering topology";
goto error;
}
} else {
/* it wasn't passed down to us, so go get it */
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
error = "topology discovery";
goto error;
}
/* push it into the PMIx database in case someone
* tries to retrieve it so we avoid an attempt to
* get it again */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
kv->type = OPAL_STRING;
if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) {
error = "topology export";
goto error;
}
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&wildcard_rank, kv))) {
error = "topology store";
goto error;
}
OBJ_RELEASE(kv);
}
/* get our local peers */
if (0 < orte_process_info.num_local_peers) {
/* if my local rank is too high, then that's an error */


@@ -190,6 +190,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
/* if the user specified a default binding policy via
* MCA param, then we use it - this can include a directive
* to overload */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding policy given", __LINE__);
jdata->map->binding = opal_hwloc_binding_policy;
} else if (1 < jdata->map->cpus_per_rank) {
/* bind to cpus */
@@ -238,6 +240,26 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using bynuma", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
} else {
/* we are mapping by node or some other non-object method */
if (nprocs <= 2) {
if (opal_hwloc_use_hwthreads_as_cpus) {
/* if we are using hwthread cpus, then bind to those */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using byhwthread", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_HWTHREAD);
} else {
/* for performance, bind to core */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using bycore", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CORE);
}
} else {
/* for performance, bind to NUMA */
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps[%d] binding not given - using bynuma", __LINE__);
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
}
}
} else if (nprocs <= 2) {
if (opal_hwloc_use_hwthreads_as_cpus) {


@@ -67,6 +67,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
uid_t uid;
gid_t gid;
opal_list_t *cache;
hwloc_obj_t machine;
opal_output_verbose(2, orte_pmix_server_globals.output,
"%s register nspace for %s",
@@ -247,6 +248,16 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.string = strdup(orte_topo_signature);
opal_list_append(info, &kv->super);
/* total available physical memory */
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
if (NULL != machine) {
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_AVAIL_PHYS_MEMORY);
kv->type = OPAL_UINT64;
kv->data.uint64 = machine->memory.total_memory;
opal_list_append(info, &kv->super);
}
/* register any local clients */
vpid = ORTE_VPID_MAX;
micro = NULL;
@@ -328,44 +339,53 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp);
opal_list_append(pmap, &kv->super);
free(tmp);
} else {
/* the proc is not bound */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_LOCALITY_STRING);
kv->type = OPAL_STRING;
kv->data.string = NULL;
opal_list_append(pmap, &kv->super);
}
}
if (1 < jdata->num_apps) {
/* appnum */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPNUM);
kv->type = OPAL_UINT32;
kv->data.uint32 = pptr->app_idx;
opal_list_append(pmap, &kv->super);
/* app ldr */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APPLDR);
kv->type = OPAL_VPID;
kv->data.name.vpid = app->first_rank;
opal_list_append(pmap, &kv->super);
/* global/univ rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->name.vpid + jdata->offset;
opal_list_append(pmap, &kv->super);
/* app rank */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_RANK);
kv->type = OPAL_VPID;
kv->data.name.vpid = pptr->app_rank;
opal_list_append(pmap, &kv->super);
/* app size */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_APP_SIZE);
kv->type = OPAL_UINT32;
kv->data.uint32 = app->num_procs;
opal_list_append(info, &kv->super);
}
/* local rank */
kv = OBJ_NEW(opal_value_t);
@@ -381,13 +401,6 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
kv->data.uint32 = pptr->node_rank;
opal_list_append(pmap, &kv->super);
/* hostname */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_HOSTNAME);
kv->type = OPAL_STRING;
kv->data.string = strdup(pptr->node->name);
opal_list_append(pmap, &kv->super);
/* node ID */
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_NODEID);