Merge pull request #2648 from rhc54/topic/topo
Only instantiate the HWLOC topology in an MPI process if it actually will be used.
Этот коммит содержится в:
Коммит
a16162832b
@ -15,6 +15,7 @@
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -51,7 +52,7 @@ static void out(char *str, char *arg);
|
||||
|
||||
|
||||
void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
|
||||
int *error_code, ...)
|
||||
int *error_code, ...)
|
||||
{
|
||||
char *name;
|
||||
struct ompi_communicator_t *abort_comm;
|
||||
@ -72,7 +73,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
|
||||
|
||||
|
||||
void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
|
||||
int *error_code, ...)
|
||||
int *error_code, ...)
|
||||
{
|
||||
char *name;
|
||||
struct ompi_communicator_t *abort_comm;
|
||||
@ -93,7 +94,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
|
||||
|
||||
|
||||
void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
|
||||
int *error_code, ...)
|
||||
int *error_code, ...)
|
||||
{
|
||||
char *name;
|
||||
struct ompi_communicator_t *abort_comm = NULL;
|
||||
@ -111,7 +112,7 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
|
||||
}
|
||||
|
||||
void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
|
||||
int *error_code, ...)
|
||||
int *error_code, ...)
|
||||
{
|
||||
/* Don't need anything more -- just need this function to exist */
|
||||
/* Silence some compiler warnings */
|
||||
@ -123,7 +124,7 @@ void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
|
||||
|
||||
|
||||
void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
|
||||
int *error_code, ...)
|
||||
int *error_code, ...)
|
||||
{
|
||||
/* Don't need anything more -- just need this function to exist */
|
||||
/* Silence some compiler warnings */
|
||||
@ -135,7 +136,7 @@ void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
|
||||
|
||||
|
||||
void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
|
||||
int *error_code, ...)
|
||||
int *error_code, ...)
|
||||
{
|
||||
/* Don't need anything more -- just need this function to exist */
|
||||
/* Silence some compiler warnings */
|
||||
@ -181,6 +182,7 @@ static void backend_fatal_aggregate(char *type,
|
||||
const char* const unknown_error_code = "Error code: %d (no associated error message)";
|
||||
const char* const unknown_error = "Unknown error";
|
||||
const char* const unknown_prefix = "[?:?]";
|
||||
bool generated = false;
|
||||
|
||||
// these do not own what they point to; they're
|
||||
// here to avoid repeating expressions such as
|
||||
@ -209,6 +211,8 @@ static void backend_fatal_aggregate(char *type,
|
||||
err_msg = NULL;
|
||||
opal_output(0, "%s", "Could not write to err_msg");
|
||||
opal_output(0, unknown_error_code, *error_code);
|
||||
} else {
|
||||
generated = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -254,7 +258,9 @@ static void backend_fatal_aggregate(char *type,
|
||||
}
|
||||
|
||||
free(prefix);
|
||||
free(err_msg);
|
||||
if (generated) {
|
||||
free(err_msg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -4,6 +4,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2015 INRIA. All rights reserved.
|
||||
* Copyright (c) 2011-2015 Université Bordeaux 1
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -61,9 +62,6 @@ mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component =
|
||||
|
||||
static int init_query(bool enable_progress_threads, bool enable_mpi_threads)
|
||||
{
|
||||
if(NULL == opal_hwloc_topology) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -97,4 +95,3 @@ static int mca_topo_treematch_component_register(void)
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &mca_topo_treematch_component.reorder_mode);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -5,7 +5,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2015 INRIA. All rights reserved.
|
||||
* Copyright (c) 2012-2015 Bordeaux Poytechnic Institute
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2016 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
|
||||
@ -256,7 +256,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
|
||||
/* Then, we need to know if the processes are bound */
|
||||
/* We make the hypothesis that all processes are in */
|
||||
/* the same state : all bound or none bound */
|
||||
assert(NULL != opal_hwloc_topology);
|
||||
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
|
||||
goto fallback;
|
||||
}
|
||||
root_obj = hwloc_get_root_obj(opal_hwloc_topology);
|
||||
if (NULL == root_obj) goto fallback;
|
||||
|
||||
@ -873,7 +875,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
|
||||
if( -1 == hwloc_err) goto fallback;
|
||||
|
||||
/* Report new binding to ORTE/OPAL */
|
||||
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
|
||||
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
|
||||
err = hwloc_bitmap_snprintf (set_as_string,64,set);
|
||||
|
||||
#ifdef __DEBUG__
|
||||
|
@ -508,16 +508,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
/* check for timing request - get stop time and report elapsed time if so */
|
||||
OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));
|
||||
|
||||
/* if hwloc is available but didn't get setup for some
|
||||
* reason, do so now
|
||||
*/
|
||||
if (NULL == opal_hwloc_topology) {
|
||||
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
|
||||
error = "Topology init";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
/* Register the default errhandler callback */
|
||||
errtrk.status = OPAL_ERROR;
|
||||
errtrk.active = true;
|
||||
|
@ -18,7 +18,7 @@
|
||||
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
|
||||
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014 Bull SAS. All rights reserved.
|
||||
@ -1502,13 +1502,33 @@ static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
|
||||
static uint64_t calculate_total_mem (void)
|
||||
{
|
||||
hwloc_obj_t machine;
|
||||
int rc;
|
||||
uint64_t mem, *mptr;
|
||||
opal_process_name_t wildcard_rank;
|
||||
|
||||
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
|
||||
if (NULL == machine) {
|
||||
return 0;
|
||||
/* first try to retrieve it from PMIx as it may have
|
||||
* been provided */
|
||||
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
|
||||
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
|
||||
mptr = &mem;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
|
||||
&wildcard_rank, &mptr, OPAL_UINT64);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
return mem;
|
||||
}
|
||||
|
||||
return machine->memory.total_memory;
|
||||
/* if not available, then ensure that the topology has been
|
||||
* loaded and try to get it from there */
|
||||
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
|
||||
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
|
||||
if (NULL == machine) {
|
||||
return 0;
|
||||
}
|
||||
return machine->memory.total_memory;
|
||||
}
|
||||
|
||||
/* couldn't find it */
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -2312,7 +2332,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
|
||||
float distance = 0;
|
||||
|
||||
/* Override any distance logic so all devices are used */
|
||||
if (0 != mca_btl_openib_component.ignore_locality) {
|
||||
if (0 != mca_btl_openib_component.ignore_locality ||
|
||||
OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
|
||||
return distance;
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
#include "opal/mca/shmem/base/base.h"
|
||||
#include "opal/mca/shmem/shmem.h"
|
||||
|
||||
@ -242,7 +242,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
|
||||
free(loc);
|
||||
} else {
|
||||
/* If we have hwloc support, then get accurate information */
|
||||
if (NULL != opal_hwloc_topology) {
|
||||
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
|
||||
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_AVAILABLE);
|
||||
@ -257,6 +257,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
|
||||
}
|
||||
}
|
||||
/* see if we were given our location */
|
||||
loc = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
|
||||
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
@ -283,8 +284,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
|
||||
}
|
||||
} else {
|
||||
/* If we have hwloc support, then get accurate information */
|
||||
if (NULL != opal_hwloc_topology && num_mem_nodes > 0 &&
|
||||
NULL != opal_process_info.cpuset) {
|
||||
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0) {
|
||||
int numa=0, w;
|
||||
unsigned n_bound=0;
|
||||
hwloc_cpuset_t avail;
|
||||
|
@ -18,7 +18,7 @@
|
||||
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -48,6 +48,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/printf.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
#include "opal/mca/shmem/base/base.h"
|
||||
#include "opal/mca/shmem/shmem.h"
|
||||
#include "opal/datatype/opal_convertor.h"
|
||||
@ -232,23 +233,28 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
|
||||
int my_mem_node, num_mem_nodes, i, rc;
|
||||
mca_common_sm_mpool_resources_t *res = NULL;
|
||||
mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;
|
||||
char *loc, *mynuma;
|
||||
opal_process_name_t wildcard_rank;
|
||||
|
||||
/* Assume we don't have hwloc support and fill in dummy info */
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node = 0;
|
||||
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;
|
||||
|
||||
/* If we have hwloc support, then get accurate information */
|
||||
if (NULL != opal_hwloc_topology) {
|
||||
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_AVAILABLE);
|
||||
|
||||
/* If we find >0 NUMA nodes, then investigate further */
|
||||
if (i > 0) {
|
||||
int numa=0, w;
|
||||
unsigned n_bound=0;
|
||||
hwloc_cpuset_t avail;
|
||||
hwloc_obj_t obj;
|
||||
/* see if we were given a topology signature */
|
||||
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
|
||||
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_TOPOLOGY_SIGNATURE,
|
||||
&wildcard_rank, &loc, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
/* the number of NUMA nodes is right at the front */
|
||||
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10);
|
||||
free(loc);
|
||||
} else {
|
||||
/* If we have hwloc support, then get accurate information */
|
||||
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
|
||||
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
|
||||
HWLOC_OBJ_NODE, 0,
|
||||
OPAL_HWLOC_AVAILABLE);
|
||||
|
||||
/* JMS This tells me how many numa nodes are *available*,
|
||||
but it's not how many are being used *by this job*.
|
||||
@ -257,33 +263,65 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
|
||||
should be improved to be how many NUMA nodes are being
|
||||
used *in this job*. */
|
||||
mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
|
||||
}
|
||||
}
|
||||
/* see if we were given our location */
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
|
||||
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == rc) {
|
||||
if (NULL == loc) {
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
|
||||
} else {
|
||||
/* get our NUMA location */
|
||||
mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0);
|
||||
if (NULL == mynuma ||
|
||||
NULL != strchr(mynuma, ',') ||
|
||||
NULL != strchr(mynuma, '-')) {
|
||||
/* we either have no idea what NUMA we are on, or we
|
||||
* are on multiple NUMA nodes */
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
|
||||
} else {
|
||||
/* we are bound to a single NUMA node */
|
||||
my_mem_node = strtoul(mynuma, NULL, 10);
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node;
|
||||
}
|
||||
if (NULL != mynuma) {
|
||||
free(mynuma);
|
||||
}
|
||||
free(loc);
|
||||
}
|
||||
} else {
|
||||
/* If we have hwloc support, then get accurate information */
|
||||
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() &&
|
||||
num_mem_nodes > 0 && NULL != opal_process_info.cpuset) {
|
||||
int numa=0, w;
|
||||
unsigned n_bound=0;
|
||||
hwloc_cpuset_t avail;
|
||||
hwloc_obj_t obj;
|
||||
|
||||
/* if we are not bound, then there is nothing further to do */
|
||||
if (NULL != opal_process_info.cpuset) {
|
||||
/* count the number of NUMA nodes to which we are bound */
|
||||
for (w=0; w < i; w++) {
|
||||
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
|
||||
HWLOC_OBJ_NODE, 0, w,
|
||||
OPAL_HWLOC_AVAILABLE))) {
|
||||
continue;
|
||||
}
|
||||
/* get that NUMA node's available cpus */
|
||||
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
|
||||
/* see if we intersect */
|
||||
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
|
||||
n_bound++;
|
||||
numa = w;
|
||||
}
|
||||
/* count the number of NUMA nodes to which we are bound */
|
||||
for (w=0; w < i; w++) {
|
||||
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
|
||||
HWLOC_OBJ_NODE, 0, w,
|
||||
OPAL_HWLOC_AVAILABLE))) {
|
||||
continue;
|
||||
}
|
||||
/* if we are located on more than one NUMA, or we didn't find
|
||||
* a NUMA we are on, then not much we can do
|
||||
*/
|
||||
if (1 == n_bound) {
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node = numa;
|
||||
} else {
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
|
||||
/* get that NUMA node's available cpus */
|
||||
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
|
||||
/* see if we intersect */
|
||||
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
|
||||
n_bound++;
|
||||
numa = w;
|
||||
}
|
||||
}
|
||||
/* if we are located on more than one NUMA, or we didn't find
|
||||
* a NUMA we are on, then not much we can do
|
||||
*/
|
||||
if (1 == n_bound) {
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node = numa;
|
||||
} else {
|
||||
mca_btl_smcuda_component.mem_node = my_mem_node = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -431,7 +469,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
|
||||
mca_btl_smcuda_component.sm_free_list_inc,
|
||||
mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
|
||||
if ( OPAL_SUCCESS != i )
|
||||
return i;
|
||||
return i;
|
||||
|
||||
mca_btl_smcuda_component.num_outstanding_frags = 0;
|
||||
|
||||
@ -1120,8 +1158,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
|
||||
mca_common_wait_stream_synchronize(&rget_reg);
|
||||
|
||||
rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size,
|
||||
"mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
|
||||
&done);
|
||||
"mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
|
||||
&done);
|
||||
if (OPAL_SUCCESS != rc) {
|
||||
/* Out of resources can be handled by upper layers. */
|
||||
if (OPAL_ERR_OUT_OF_RESOURCE != rc) {
|
||||
|
@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -9,7 +10,7 @@
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/constants.h"
|
||||
|
||||
#if BTL_IN_OPAL
|
||||
@ -191,6 +192,13 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic:filter_numa: filtering devices by NUMA distance");
|
||||
|
||||
/* ensure we have the topology */
|
||||
if (OPAL_SUCCESS !=- opal_hwloc_base_get_topology()) {
|
||||
opal_output_verbose(5, USNIC_OUT,
|
||||
"btl:usnic:filter_numa: not sorting devices by NUMA distance (topology not available)");
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/* Get the hwloc distance matrix for all NUMA nodes */
|
||||
if (OPAL_SUCCESS != (ret = get_distance_matrix())) {
|
||||
return ret;
|
||||
|
@ -40,6 +40,7 @@
|
||||
#include "opal/util/os_dirpath.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/threads/tsd.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
@ -240,12 +241,65 @@ static void fill_cache_line_size(void)
|
||||
|
||||
int opal_hwloc_base_get_topology(void)
|
||||
{
|
||||
int rc=OPAL_SUCCESS;
|
||||
int rc;
|
||||
opal_process_name_t wildcard_rank;
|
||||
char *val = NULL;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
|
||||
OPAL_OUTPUT_VERBOSE((2, opal_hwloc_base_framework.framework_output,
|
||||
"hwloc:base:get_topology"));
|
||||
|
||||
if (NULL == opal_hwloc_base_topo_file) {
|
||||
/* see if we already got it */
|
||||
if (NULL != opal_hwloc_topology) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (NULL != opal_pmix.get) {
|
||||
/* try to retrieve it from the PMIx store */
|
||||
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
|
||||
"hwloc:base instantiating topology");
|
||||
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
|
||||
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_TOPO,
|
||||
&wildcard_rank, &val, OPAL_STRING);
|
||||
} else {
|
||||
rc = OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
if (OPAL_SUCCESS == rc && NULL != val) {
|
||||
/* load the topology */
|
||||
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
|
||||
free(val);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
|
||||
free(val);
|
||||
hwloc_topology_destroy(opal_hwloc_topology);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
/* since we are loading this from an external source, we have to
|
||||
* explicitly set a flag so hwloc sets things up correctly
|
||||
*/
|
||||
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
|
||||
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
|
||||
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
|
||||
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
|
||||
hwloc_topology_destroy(opal_hwloc_topology);
|
||||
free(val);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
/* now load the topology */
|
||||
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
|
||||
hwloc_topology_destroy(opal_hwloc_topology);
|
||||
free(val);
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
free(val);
|
||||
/* filter the cpus thru any default cpu set */
|
||||
if (OPAL_SUCCESS != (rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
|
||||
hwloc_topology_destroy(opal_hwloc_topology);
|
||||
return rc;
|
||||
}
|
||||
} else if (NULL == opal_hwloc_base_topo_file) {
|
||||
if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
|
||||
0 != hwloc_topology_set_flags(opal_hwloc_topology,
|
||||
(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
|
||||
@ -266,7 +320,12 @@ int opal_hwloc_base_get_topology(void)
|
||||
line size */
|
||||
fill_cache_line_size();
|
||||
|
||||
return rc;
|
||||
/* get or update our local cpuset - it will get used multiple
|
||||
* times, so it's more efficient to keep a global copy
|
||||
*/
|
||||
opal_hwloc_base_get_local_cpuset();
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
int opal_hwloc_base_set_topology(char *topofile)
|
||||
|
@ -106,6 +106,7 @@ BEGIN_C_DECLS
|
||||
#define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs
|
||||
#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string
|
||||
#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location
|
||||
#define OPAL_PMIX_AVAIL_PHYS_MEMORY "pmix.pmem" // (uint64_t) total available physical memory on this node
|
||||
|
||||
#define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace
|
||||
#define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -56,7 +56,8 @@ int orte_ess_base_proc_binding(void)
|
||||
char *error=NULL;
|
||||
hwloc_cpuset_t mycpus;
|
||||
|
||||
/* Determine if we were pre-bound or not */
|
||||
/* Determine if we were pre-bound or not - this also indicates
|
||||
* that we were launched via mpirun, bound or not */
|
||||
if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) {
|
||||
orte_proc_is_bound = true;
|
||||
if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) {
|
||||
@ -66,21 +67,49 @@ int orte_ess_base_proc_binding(void)
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
|
||||
/* print out a shorthand notation to avoid pulling in the entire topology tree */
|
||||
map = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
|
||||
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != map) {
|
||||
opal_output(0, "MCW rank %s bound to %s",
|
||||
ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
|
||||
free(map);
|
||||
} else {
|
||||
opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid));
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
} else if (NULL != getenv(OPAL_MCA_PREFIX"orte_externally_bound")) {
|
||||
orte_proc_is_bound = true;
|
||||
/* see if we were launched by a PMIx-enabled system */
|
||||
map = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
|
||||
ORTE_PROC_MY_NAME, &map, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != map) {
|
||||
/* we were - no need to pull in the topology */
|
||||
if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
|
||||
opal_output(0, "MCW rank %s bound to %s",
|
||||
ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
|
||||
}
|
||||
free(map);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* the topology system will pick up the binding pattern */
|
||||
}
|
||||
|
||||
/* load the topology as we will likely need it */
|
||||
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
|
||||
/* there is nothing we can do, so just return */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* see if we were bound when launched */
|
||||
if (!orte_proc_is_bound) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
|
||||
"%s Not bound at launch",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
/* we were not bound at launch */
|
||||
if (NULL == opal_hwloc_topology) {
|
||||
/* there is nothing we can do, so just return */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
|
||||
/* get our node object */
|
||||
node = hwloc_get_root_obj(opal_hwloc_topology);
|
||||
@ -257,11 +286,6 @@ int orte_ess_base_proc_binding(void)
|
||||
}
|
||||
|
||||
MOVEON:
|
||||
/* get or update our local cpuset - it will get used multiple
|
||||
* times, so it's more efficient to keep a global copy
|
||||
*/
|
||||
opal_hwloc_base_get_local_cpuset();
|
||||
|
||||
/* get the cpus we are bound to */
|
||||
mycpus = hwloc_bitmap_alloc();
|
||||
if (hwloc_get_cpubind(opal_hwloc_topology,
|
||||
|
@ -302,75 +302,6 @@ static int rte_init(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* retrieve our topology */
|
||||
val = NULL;
|
||||
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
|
||||
&wildcard_rank, &val, OPAL_STRING);
|
||||
if (OPAL_SUCCESS == ret && NULL != val) {
|
||||
/* load the topology */
|
||||
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
|
||||
ret = OPAL_ERROR;
|
||||
free(val);
|
||||
error = "setting topology";
|
||||
goto error;
|
||||
}
|
||||
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
|
||||
ret = OPAL_ERROR;
|
||||
free(val);
|
||||
hwloc_topology_destroy(opal_hwloc_topology);
|
||||
error = "setting topology";
|
||||
goto error;
|
||||
}
|
||||
/* since we are loading this from an external source, we have to
|
||||
* explicitly set a flag so hwloc sets things up correctly
|
||||
*/
|
||||
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
|
||||
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
|
||||
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
|
||||
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
|
||||
ret = OPAL_ERROR;
|
||||
hwloc_topology_destroy(opal_hwloc_topology);
|
||||
free(val);
|
||||
error = "setting topology";
|
||||
goto error;
|
||||
}
|
||||
/* now load the topology */
|
||||
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
|
||||
ret = OPAL_ERROR;
|
||||
hwloc_topology_destroy(opal_hwloc_topology);
|
||||
free(val);
|
||||
error = "setting topology";
|
||||
goto error;
|
||||
}
|
||||
free(val);
|
||||
/* filter the cpus thru any default cpu set */
|
||||
if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
|
||||
error = "filtering topology";
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
/* it wasn't passed down to us, so go get it */
|
||||
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
|
||||
error = "topology discovery";
|
||||
goto error;
|
||||
}
|
||||
/* push it into the PMIx database in case someone
|
||||
* tries to retrieve it so we avoid an attempt to
|
||||
* get it again */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
|
||||
kv->type = OPAL_STRING;
|
||||
if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) {
|
||||
error = "topology export";
|
||||
goto error;
|
||||
}
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&wildcard_rank, kv))) {
|
||||
error = "topology store";
|
||||
goto error;
|
||||
}
|
||||
OBJ_RELEASE(kv);
|
||||
}
|
||||
|
||||
/* get our local peers */
|
||||
if (0 < orte_process_info.num_local_peers) {
|
||||
/* if my local rank is too high, then that's an error */
|
||||
|
@ -190,6 +190,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
/* if the user specified a default binding policy via
|
||||
* MCA param, then we use it - this can include a directive
|
||||
* to overload */
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps[%d] binding policy given", __LINE__);
|
||||
jdata->map->binding = opal_hwloc_binding_policy;
|
||||
} else if (1 < jdata->map->cpus_per_rank) {
|
||||
/* bind to cpus */
|
||||
@ -238,6 +240,26 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps[%d] binding not given - using bynuma", __LINE__);
|
||||
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
|
||||
} else {
|
||||
/* we are mapping by node or some other non-object method */
|
||||
if (nprocs <= 2) {
|
||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||
/* if we are using hwthread cpus, then bind to those */
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps[%d] binding not given - using byhwthread", __LINE__);
|
||||
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_HWTHREAD);
|
||||
} else {
|
||||
/* for performance, bind to core */
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps[%d] binding not given - using bycore", __LINE__);
|
||||
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CORE);
|
||||
}
|
||||
} else {
|
||||
/* for performance, bind to NUMA */
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps[%d] binding not given - using bynuma", __LINE__);
|
||||
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
|
||||
}
|
||||
}
|
||||
} else if (nprocs <= 2) {
|
||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||
|
@ -67,6 +67,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
uid_t uid;
|
||||
gid_t gid;
|
||||
opal_list_t *cache;
|
||||
hwloc_obj_t machine;
|
||||
|
||||
opal_output_verbose(2, orte_pmix_server_globals.output,
|
||||
"%s register nspace for %s",
|
||||
@ -247,6 +248,16 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
kv->data.string = strdup(orte_topo_signature);
|
||||
opal_list_append(info, &kv->super);
|
||||
|
||||
/* total available physical memory */
|
||||
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
|
||||
if (NULL != machine) {
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_AVAIL_PHYS_MEMORY);
|
||||
kv->type = OPAL_UINT64;
|
||||
kv->data.uint64 = machine->memory.total_memory;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
|
||||
/* register any local clients */
|
||||
vpid = ORTE_VPID_MAX;
|
||||
micro = NULL;
|
||||
@ -328,44 +339,53 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp);
|
||||
opal_list_append(pmap, &kv->super);
|
||||
free(tmp);
|
||||
} else {
|
||||
/* the proc is not bound */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_LOCALITY_STRING);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = NULL;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
}
|
||||
}
|
||||
|
||||
/* appnum */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APPNUM);
|
||||
kv->type = OPAL_UINT32;
|
||||
kv->data.uint32 = pptr->app_idx;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
if (1 < jdata->num_apps) {
|
||||
/* appnum */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APPNUM);
|
||||
kv->type = OPAL_UINT32;
|
||||
kv->data.uint32 = pptr->app_idx;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* app ldr */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APPLDR);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = app->first_rank;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
/* app ldr */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APPLDR);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = app->first_rank;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* global/univ rank */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = pptr->name.vpid + jdata->offset;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
/* global/univ rank */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = pptr->name.vpid + jdata->offset;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* app rank */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APP_RANK);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = pptr->app_rank;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
/* app rank */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APP_RANK);
|
||||
kv->type = OPAL_VPID;
|
||||
kv->data.name.vpid = pptr->app_rank;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* app size */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APP_SIZE);
|
||||
kv->type = OPAL_UINT32;
|
||||
kv->data.uint32 = app->num_procs;
|
||||
opal_list_append(info, &kv->super);
|
||||
/* app size */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_APP_SIZE);
|
||||
kv->type = OPAL_UINT32;
|
||||
kv->data.uint32 = app->num_procs;
|
||||
opal_list_append(info, &kv->super);
|
||||
}
|
||||
|
||||
/* local rank */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
@ -381,13 +401,6 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
|
||||
kv->data.uint32 = pptr->node_rank;
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* hostname */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_HOSTNAME);
|
||||
kv->type = OPAL_STRING;
|
||||
kv->data.string = strdup(pptr->node->name);
|
||||
opal_list_append(pmap, &kv->super);
|
||||
|
||||
/* node ID */
|
||||
kv = OBJ_NEW(opal_value_t);
|
||||
kv->key = strdup(OPAL_PMIX_NODEID);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user