/*
 * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef OPAL_HWLOC_BASE_H
#define OPAL_HWLOC_BASE_H

#include "opal_config.h"

#include "opal/dss/dss_types.h"

#include "opal/mca/paffinity/paffinity.h"

#include "opal/mca/hwloc/hwloc.h"

/*
 * Global functions for MCA overall hwloc open and close
 */

BEGIN_C_DECLS

/**
 * Initialize the hwloc MCA framework
 *
 * @retval OPAL_SUCCESS Upon success
 * @retval OPAL_ERROR Upon failure
 *
 * This must be the first function invoked in the hwloc MCA
 * framework.  It initializes the hwloc MCA framework, finds
 * and opens hwloc components, etc.
 *
 * This function is invoked during opal_init().
 *
 * This function fills in the internal global variable
 * opal_hwloc_base_components_opened, which is a list of all
 * hwloc components that were successfully opened.  This
 * variable should \em only be used by other hwloc base
 * functions -- it is not considered a public interface member --
 * and is only mentioned here for completeness.
 *
 * Note that this function does NOT fill the global variable
 * opal_hwloc_topology, nor does it set the process-wide memory
 * affinity policy.  Filling opal_hwloc_topology via
 * hwloc_topology_load() can be expensive (and/or serialized by the
 * OS); it may not be desirable to call this function in every MPI
 * process on a machine.  Hence, it is the responsibility of an upper
 * layer to both fill opal_hwloc_topology in some scalable way, as
 * well as to invoke opal_hwloc_base_set_process_membind_policy()
 * (after opal_hwloc_topology has been loaded) to set the process-wide
 * memory affinity policy.
 */
OPAL_DECLSPEC int opal_hwloc_base_open(void);
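
/*
 * The note above means an upper layer is expected to do something
 * like the following (a minimal sketch, not part of this API; it
 * assumes the topology is filled by direct hwloc discovery, although
 * an upper layer may obtain it in some other, more scalable way):
 *
 *     opal_hwloc_base_open();
 *     hwloc_topology_init(&opal_hwloc_topology);
 *     hwloc_topology_load(opal_hwloc_topology);
 *     opal_hwloc_base_set_process_membind_policy();
 */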

/**
 * Shut down the hwloc MCA framework.
 *
 * @retval OPAL_SUCCESS Always
 *
 * This function shuts down everything in the hwloc MCA
 * framework, and is called during opal_finalize().
 *
 * It must be the last function invoked on the hwloc MCA
 * framework.
 */
OPAL_DECLSPEC int opal_hwloc_base_close(void);

/**
 * Debugging output stream
 */
OPAL_DECLSPEC extern int opal_hwloc_base_output;
OPAL_DECLSPEC extern opal_list_t opal_hwloc_base_components;
OPAL_DECLSPEC extern bool opal_hwloc_base_inited;
OPAL_DECLSPEC extern bool opal_hwloc_topology_inited;

#if OPAL_HAVE_HWLOC
OPAL_DECLSPEC extern char *opal_hwloc_base_slot_list;
OPAL_DECLSPEC extern char *opal_hwloc_base_cpu_set;
OPAL_DECLSPEC extern hwloc_cpuset_t opal_hwloc_base_given_cpus;

/**
 * Report a bind failure using the normal mechanisms if a component
 * fails to bind memory -- according to the value of the
 * hwloc_base_bind_failure_action MCA parameter.
 */
OPAL_DECLSPEC int opal_hwloc_base_report_bind_failure(const char *file,
                                                      int line,
                                                      const char *msg,
                                                      int rc);
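
/*
 * Illustrative call site (a sketch only; actual call sites live in
 * the hwloc components, and the hwloc_set_area_membind() arguments
 * are elided here):
 *
 *     if (0 != hwloc_set_area_membind(...)) {
 *         return opal_hwloc_base_report_bind_failure(__FILE__, __LINE__,
 *                                                    "memory binding failed",
 *                                                    OPAL_ERROR);
 *     }
 */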

OPAL_DECLSPEC opal_paffinity_locality_t opal_hwloc_base_get_relative_locality(hwloc_topology_t topo,
                                                                              opal_hwloc_level_t level1,
                                                                              unsigned int peer1,
                                                                              opal_hwloc_level_t level2,
                                                                              unsigned int peer2);

OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);

/**
 * Enum for what memory allocation policy we want for user allocations.
 * MAP = memory allocation policy.
 */
typedef enum {
    OPAL_HWLOC_BASE_MAP_NONE,
    OPAL_HWLOC_BASE_MAP_LOCAL_ONLY
} opal_hwloc_base_map_t;

/**
 * Global reflecting the MAP (set by MCA param).
 */
OPAL_DECLSPEC extern opal_hwloc_base_map_t opal_hwloc_base_map;

/**
 * Enum for what to do if the hwloc framework tries to bind memory
 * and fails.  BFA = bind failure action.
 */
typedef enum {
    OPAL_HWLOC_BASE_MBFA_SILENT,
    OPAL_HWLOC_BASE_MBFA_WARN,
    OPAL_HWLOC_BASE_MBFA_ERROR
} opal_hwloc_base_mbfa_t;

/**
 * Global reflecting the BFA (set by MCA param).
 */
OPAL_DECLSPEC extern opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa;

/* some critical helper functions */
OPAL_DECLSPEC int opal_hwloc_base_filter_cpus(hwloc_topology_t topo);
OPAL_DECLSPEC int opal_hwloc_base_get_topology(void);
OPAL_DECLSPEC void opal_hwloc_base_free_topology(hwloc_topology_t topo);
OPAL_DECLSPEC hwloc_cpuset_t opal_hwloc_base_get_available_cpus(hwloc_topology_t topo,
                                                                hwloc_obj_t obj);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
                                                              hwloc_obj_type_t target,
                                                              unsigned cache_level,
                                                              opal_hwloc_resource_type_t rtype);
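
/*
 * Example (a sketch): count the NUMA nodes in a topology, where rtype
 * is one of the opal_hwloc_resource_type_t values defined in
 * opal/mca/hwloc/hwloc.h, and cache_level is presumably only used
 * when the target is a cache type:
 *
 *     unsigned int nnumas;
 *     nnumas = opal_hwloc_base_get_nbobjs_by_type(topo, HWLOC_OBJ_NODE, 0, rtype);
 */
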
OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_obj_by_type(hwloc_topology_t topo,
                                                          hwloc_obj_type_t target,
                                                          unsigned cache_level,
                                                          unsigned int instance,
                                                          opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_obj_idx(hwloc_topology_t topo,
                                                       hwloc_obj_t obj,
                                                       opal_hwloc_resource_type_t rtype);
OPAL_DECLSPEC void opal_hwloc_base_get_level_and_index(hwloc_cpuset_t cpus,
                                                       opal_hwloc_level_t *bind_level,
                                                       unsigned int *bind_idx);
OPAL_DECLSPEC unsigned int opal_hwloc_base_get_npus(hwloc_topology_t topo,
                                                    hwloc_obj_t target);
OPAL_DECLSPEC char* opal_hwloc_base_print_binding(opal_binding_policy_t binding);
OPAL_DECLSPEC char* opal_hwloc_base_print_locality(opal_paffinity_locality_t locality);
OPAL_DECLSPEC char* opal_hwloc_base_print_level(opal_hwloc_level_t level);

/**
 * Provide a utility to parse a slot list against the local
 * logical cpus, and produce a cpuset for the described binding
 */
OPAL_DECLSPEC int opal_hwloc_base_slot_list_parse(const char *slot_str,
                                                  hwloc_topology_t topo,
                                                  hwloc_cpuset_t cpumask);
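
/*
 * Example usage (a sketch; the exact slot-list syntax accepted here
 * is not documented in this header, so the "0:0-1" string below is
 * purely illustrative):
 *
 *     hwloc_cpuset_t cpus = hwloc_bitmap_alloc();
 *     if (OPAL_SUCCESS == opal_hwloc_base_slot_list_parse("0:0-1", topo, cpus)) {
 *         ... bind the process to the resulting cpus bitmap ...
 *     }
 *     hwloc_bitmap_free(cpus);
 */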

/**
 * This function sets the process-wide memory affinity policy
 * according to opal_hwloc_base_map and opal_hwloc_base_mbfa.  It needs
 * to be a separate, standalone function (as opposed to being done
 * during opal_hwloc_base_open()) because opal_hwloc_topology is not
 * loaded by opal_hwloc_base_open().  Hence, an upper layer needs to
 * invoke this function after opal_hwloc_topology has been loaded.
 */
OPAL_DECLSPEC int opal_hwloc_base_set_process_membind_policy(void);

/* datatype support */
OPAL_DECLSPEC int opal_hwloc_pack(opal_buffer_t *buffer, const void *src,
                                  int32_t num_vals,
                                  opal_data_type_t type);
OPAL_DECLSPEC int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
                                    int32_t *num_vals,
                                    opal_data_type_t type);
OPAL_DECLSPEC int opal_hwloc_copy(hwloc_topology_t *dest,
                                  hwloc_topology_t src,
                                  opal_data_type_t type);
OPAL_DECLSPEC int opal_hwloc_compare(const hwloc_topology_t topo1,
                                     const hwloc_topology_t topo2,
                                     opal_data_type_t type);
OPAL_DECLSPEC int opal_hwloc_print(char **output, char *prefix,
                                   hwloc_topology_t src,
                                   opal_data_type_t type);
OPAL_DECLSPEC int opal_hwloc_size(size_t *size,
                                  hwloc_topology_t src,
                                  opal_data_type_t type);
OPAL_DECLSPEC void opal_hwloc_release(opal_dss_value_t *value);

#endif

END_C_DECLS

#endif /* OPAL_HWLOC_BASE_H */