1
1
openmpi/opal/mca/hwloc/base/hwloc_base_open.c

489 строки
20 KiB
C
Исходник Обычный вид История

/*
Refs trac:3275. We ran into a case where the OMPI SVN trunk grew a new acceptable MCA parameter value, but this new value was not accepted on the v1.6 branch (hwloc_base_mem_bind_failure_action -- on the trunk it accepts the value "silent", but on the older v1.6 branch, it doesn't). If you set "hwloc_base_mem_bind_failure_action=silent" in the default MCA params file and then accidentally ran with the v1.6 branch, every OMPI executable (including ompi_info) just failed because hwloc_base_open() would say "hey, 'silent' is not a valid value for hwloc_base_mem_bind_failure_action!". Kaboom. The only problem is that it didn't give you any indication of where this value was being set. Quite maddening, from a user perspective. So we changed how ompi_info handles this case. If any framework open function returns OMPI_ERR_BAD_PARAM (either because its base MCA params got a bad value or because one of its component register/open functions returns OMPI_ERR_BAD_PARAM), ompi_info will stop, print out a warning that it received an error, and then dump out the parameters that it has received so far in the framework that had a problem. At a minimum, this will show the user the MCA param that had an error (it's usually the last one), and ''where it was set from'' (so that they can go fix it). We updated ompi_info to check for O???_ERR_BAD_PARAM from each of the framework opens. Also updated the doxygen docs in mca.h for this O???_BAD_PARAM behavior. And we noticed that mca.h had MCA_SUCCESS and MCA_ERR_??? codes. Why? I think we used them in exactly one place in the code base (mca_base_components_open.c). So we deleted those and just used the normal OPAL_* codes instead. While we were doing this, we also cleaned up a little memory management during ompi_info/orte-info/opal-info finalization. 
Valgrind still reports a truckload of memory still in use at ompi_info termination, but they mostly look to be components not freeing memory/resources properly (and outside the scope of this fix). This commit was SVN r27306. The following Trac tickets were found above: Ticket 3275 --> https://svn.open-mpi.org/trac/ompi/ticket/3275
2012-09-12 00:47:24 +04:00
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/dss/dss.h"
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admins of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/threads/tsd.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/mca/hwloc/base/base.h"
/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
* component's public mca_base_component_t struct.
*/
#include "opal/mca/hwloc/base/static-components.h"
/*
* Globals
*/
/* Output stream id for the hwloc framework.  Starts at -1;
 * opal_hwloc_base_open() replaces it with an opal_output_open() handle
 * only when the "base_verbose" parameter is non-zero, and otherwise
 * resets it to -1. */
int opal_hwloc_base_output = -1;
/* List object for the framework's components.
 * NOTE(review): it is not populated anywhere in the visible part of this
 * file -- confirm where it gets filled in. */
opal_list_t opal_hwloc_base_components;
/* One-shot guard: set to true by the first call to opal_hwloc_base_open();
 * subsequent calls see it and return OPAL_SUCCESS immediately. */
bool opal_hwloc_base_inited = false;
#if OPAL_HAVE_HWLOC
/* hwloc topology handle, NULL at load time.
 * NOTE(review): the code that loads/assigns the topology is not visible
 * here -- confirm ownership and lifetime. */
hwloc_topology_t opal_hwloc_topology=NULL;
/* cpuset handle, NULL at load time.
 * NOTE(review): presumably the cpuset this process is bound to; verify
 * against the code that sets it. */
hwloc_cpuset_t opal_hwloc_my_cpuset=NULL;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
/* cpuset handle, NULL at load time.
 * NOTE(review): presumably derived from opal_hwloc_base_cpu_set -- confirm. */
hwloc_cpuset_t opal_hwloc_base_given_cpus=NULL;
/* Memory allocation policy.  Defaults to "none"; opal_hwloc_base_open()
 * overwrites it from the "base_mem_alloc_policy" string parameter
 * ("none", "local_only" or "local-only", compared case-insensitively). */
opal_hwloc_base_map_t opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE;
/* Memory-bind failure action.  Defaults to "warn"; opal_hwloc_base_open()
 * overwrites it from the "base_mem_bind_failure_action" string parameter
 * ("silent", "warn" or "error", compared case-insensitively). */
opal_hwloc_base_mbfa_t opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_WARN;
/* Binding policy, initialised to 0.
 * NOTE(review): the meaning of 0 is defined by opal_binding_policy_t,
 * which is declared outside this file -- confirm. */
opal_binding_policy_t opal_hwloc_binding_policy=0;
/* String settings, NULL until assigned.
 * NOTE(review): the code that assigns them (and whether they are
 * heap-owned) is not visible in this part of the file -- confirm. */
char *opal_hwloc_base_slot_list=NULL;
char *opal_hwloc_base_cpu_set=NULL;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
/* Boolean setting, false by default.
 * NOTE(review): presumably enables reporting of process bindings; the
 * code that reads or sets it is not visible here -- confirm. */
bool opal_hwloc_report_bindings=false;
/* Table of hwloc object types, listed from the whole machine down to a
 * single processing unit (PU).
 * NOTE(review): HWLOC_OBJ_CACHE appears three times, presumably one slot
 * per cache level; the entry order looks like it mirrors an enum used to
 * index this table elsewhere -- confirm before reordering. */
hwloc_obj_type_t opal_hwloc_levels[] = {
HWLOC_OBJ_MACHINE,
HWLOC_OBJ_NODE,
HWLOC_OBJ_SOCKET,
HWLOC_OBJ_CACHE,
HWLOC_OBJ_CACHE,
HWLOC_OBJ_CACHE,
HWLOC_OBJ_CORE,
HWLOC_OBJ_PU
};
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
/* Boolean setting, false by default.
 * NOTE(review): presumably makes each hardware thread count as a cpu
 * instead of each core; the code that reads it is not visible here --
 * confirm. */
bool opal_hwloc_use_hwthreads_as_cpus = false;
#endif
int opal_hwloc_base_open(void)
{
if (opal_hwloc_base_inited) {
return OPAL_SUCCESS;
}
opal_hwloc_base_inited = true;
#if OPAL_HAVE_HWLOC
{
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
int value, i;
opal_data_type_t tmp;
char *str_value;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
char **tmpvals, **quals;
/* Debugging / verbose output */
mca_base_param_reg_int_name("hwloc", "base_verbose",
"Verbosity level of the hwloc framework",
false, false,
0, &value);
if (0 != value) {
opal_hwloc_base_output = opal_output_open(NULL);
} else {
opal_hwloc_base_output = -1;
}
/* hwloc_base_mem_alloc_policy */
switch (opal_hwloc_base_map) {
case OPAL_HWLOC_BASE_MAP_NONE:
str_value = "none";
break;
case OPAL_HWLOC_BASE_MAP_LOCAL_ONLY:
str_value = "local_only";
break;
}
mca_base_param_reg_string_name("hwloc", "base_mem_alloc_policy",
"General memory allocations placement policy (this is not memory binding). "
"\"none\" means that no memory policy is applied. \"local_only\" means that a process' memory allocations will be restricted to its local NUMA node. "
"If using direct launch, this policy will not be in effect until after MPI_INIT. "
"Note that operating system paging policies are unaffected by this setting. For example, if \"local_only\" is used and local NUMA node memory is exhausted, a new memory allocation may cause paging.",
false, false, str_value, &str_value);
if (strcasecmp(str_value, "none") == 0) {
opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_NONE;
} else if (strcasecmp(str_value, "local_only") == 0 ||
strcasecmp(str_value, "local-only") == 0) {
opal_hwloc_base_map = OPAL_HWLOC_BASE_MAP_LOCAL_ONLY;
} else {
char hostname[32];
gethostname(hostname, sizeof(hostname));
opal_show_help("help-opal-hwloc-base.txt", "invalid mem_alloc_policy",
true, hostname, getpid(), str_value);
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
free(str_value);
return OPAL_ERR_BAD_PARAM;
}
free(str_value);
/* hwloc_base_mem_bind_failure_action */
switch (opal_hwloc_base_mbfa) {
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
case OPAL_HWLOC_BASE_MBFA_SILENT:
str_value = "silent";
break;
case OPAL_HWLOC_BASE_MBFA_WARN:
str_value = "warn";
break;
case OPAL_HWLOC_BASE_MBFA_ERROR:
str_value = "error";
break;
}
mca_base_param_reg_string_name("hwloc", "base_mem_bind_failure_action",
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
"What Open MPI will do if it explicitly tries to bind memory to a specific NUMA location, and fails. Note that this is a different case than the general allocation policy described by hwloc_base_alloc_policy. A value of \"silent\" means that Open MPI will proceed without comment. A value of \"warn\" means that Open MPI will warn the first time this happens, but allow the job to continue (possibly with degraded performance). A value of \"error\" means that Open MPI will abort the job if this happens.",
false, false, str_value, &str_value);
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
if (strcasecmp(str_value, "silent") == 0) {
opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_SILENT;
} else if (strcasecmp(str_value, "warn") == 0) {
opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_WARN;
} else if (strcasecmp(str_value, "error") == 0) {
opal_hwloc_base_mbfa = OPAL_HWLOC_BASE_MBFA_ERROR;
} else {
char hostname[32];
gethostname(hostname, sizeof(hostname));
opal_show_help("help-opal-hwloc-base.txt", "invalid mem_bind_failure_action",
true, hostname, getpid(), str_value);
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
free(str_value);
return OPAL_ERR_BAD_PARAM;
}
free(str_value);
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
/* binding specification */
mca_base_param_reg_string_name("hwloc", "base_binding_policy",
"Policy for binding processes [none (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board] (supported qualifiers: overload-allowed,if-supported)",
false, false, NULL, &str_value);
if (NULL == str_value) {
opal_hwloc_binding_policy = OPAL_BIND_TO_NONE;
/* mark that no binding policy was specified */
opal_hwloc_binding_policy &= ~OPAL_BIND_GIVEN;
} else if (0 == strncasecmp(str_value, "none", strlen("none"))) {
opal_hwloc_binding_policy = OPAL_BIND_TO_NONE;
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
} else {
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
tmpvals = opal_argv_split(str_value, ':');
if (1 < opal_argv_count(tmpvals)) {
quals = opal_argv_split(tmpvals[1], ',');
for (i=0; NULL != quals[i]; i++) {
if (0 == strcasecmp(quals[i], "if-supported")) {
opal_hwloc_binding_policy |= OPAL_BIND_IF_SUPPORTED;
} else if (0 == strcasecmp(quals[i], "overload-allowed")) {
opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD;
} else {
/* unknown option */
opal_output(0, "Unknown qualifier to orte_process_binding: %s", str_value);
return OPAL_ERR_BAD_PARAM;
}
}
opal_argv_free(quals);
}
if (0 == strcasecmp(tmpvals[0], "hwthread")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
} else if (0 == strcasecmp(tmpvals[0], "core")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
} else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L1CACHE);
} else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L2CACHE);
} else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_L3CACHE);
} else if (0 == strcasecmp(tmpvals[0], "socket")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
} else if (0 == strcasecmp(tmpvals[0], "numa")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_NUMA);
} else if (0 == strcasecmp(tmpvals[0], "board")) {
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_BOARD);
} else {
opal_show_help("help-opal-hwloc-base.txt", "invalid binding_policy", true, "binding", str_value);
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
opal_argv_free(tmpvals);
free(str_value);
return OPAL_ERR_BAD_PARAM;
}
opal_argv_free(tmpvals);
}
free(str_value);
/* backward compatibility */
mca_base_param_reg_int_name("hwloc", "base_bind_to_core",
"Bind processes to cores",
false, false, (int)false, &value);
if (value) {
/* set binding policy to core - error if something else already set */
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_CORE) {
/* error - cannot redefine the default ranking policy */
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
"core", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
Refs trac:3275. We ran into a case where the OMPI SVN trunk grew a new acceptable MCA parameter value, but this new value was not accepted on the v1.6 branch (hwloc_base_mem_bind_failure_action -- on the trunk it accepts the value "silent", but on the older v1.6 branch, it doesn't). If you set "hwloc_base_mem_bind_failure_action=silent" in the default MCA params file and then accidentally ran with the v1.6 branch, every OMPI executable (including ompi_info) just failed because hwloc_base_open() would say "hey, 'silent' is not a valid value for hwloc_base_mem_bind_failure_action!". Kaboom. The only problem is that it didn't give you any indication of where this value was being set. Quite maddening, from a user perspective. So we changed how ompi_info handles this case. If any framework open function returns OMPI_ERR_BAD_PARAM (either because its base MCA params got a bad value or because one of its component register/open functions returns OMPI_ERR_BAD_PARAM), ompi_info will stop, print out a warning that it received an error, and then dump out the parameters that it has received so far in the framework that had a problem. At a minimum, this will show the user the MCA param that had an error (it's usually the last one), and ''where it was set from'' (so that they can go fix it). We updated ompi_info to check for O???_ERR_BAD_PARAM from each of the framework opens. Also updated the doxygen docs in mca.h for this O???_BAD_PARAM behavior. And we noticed that mca.h had MCA_SUCCESS and MCA_ERR_??? codes. Why? I think we used them in exactly one place in the code base (mca_base_components_open.c). So we deleted those and just used the normal OPAL_* codes instead. While we were doing this, we also cleaned up a little memory management during ompi_info/orte-info/opal-info finalization. 
Valgrind still reports a truckload of memory still in use at ompi_info termination, but they mostly look to be components not freeing memory/resources properly (and outside the scope of this fix). This commit was SVN r27306. The following Trac tickets were found above: Ticket 3275 --> https://svn.open-mpi.org/trac/ompi/ticket/3275
2012-09-12 00:47:24 +04:00
return OPAL_ERR_BAD_PARAM;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
}
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
mca_base_param_reg_int_name("hwloc", "base_bind_to_socket",
"Bind processes to sockets",
false, false, (int)false, &value);
if (value) {
/* set binding policy to socket - error if something else already set */
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_SOCKET) {
/* error - cannot redefine the default ranking policy */
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
"socket", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
return OPAL_ERR_SILENT;
}
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
mca_base_param_reg_int_name("hwloc", "base_report_bindings",
"Report bindings to stderr",
false, false, (int)false, &value);
opal_hwloc_report_bindings = OPAL_INT_TO_BOOL(value);
/* did the user provide a slot list? */
tmp = mca_base_param_reg_string_name("hwloc", "base_slot_list",
"List of processor IDs to bind processes to [default=NULL]",
false, false, NULL, &opal_hwloc_base_slot_list);
if (NULL != opal_hwloc_base_slot_list) {
/* if we already were given a policy, then this is an error */
if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
"socket", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
return OPAL_ERR_SILENT;
}
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
/* cpu allocation specification */
mca_base_param_reg_string_name("hwloc", "base_cpu_set",
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
"Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]",
false, false, NULL, &opal_hwloc_base_cpu_set);
if (NULL != opal_hwloc_base_cpu_set) {
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
/* it is okay if a binding policy was already given - just ensure that
* we do bind to the given cpus if provided, otherwise this would be
* ignored if someone didn't also specify a binding policy
*/
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
opal_hwloc_binding_policy |= OPAL_BIND_GIVEN;
}
}
/* to support tools such as ompi_info, add the components
* to a list
*/
OBJ_CONSTRUCT(&opal_hwloc_base_components, opal_list_t);
if (OPAL_SUCCESS !=
mca_base_components_open("hwloc", opal_hwloc_base_output,
mca_hwloc_base_static_components,
&opal_hwloc_base_components, true)) {
return OPAL_ERROR;
}
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
/* declare hwthreads as independent cpus */
mca_base_param_reg_int_name("hwloc", "base_use_hwthreads_as_cpus",
"Use hardware threads as independent cpus",
false, false, (int)false, &value);
opal_hwloc_use_hwthreads_as_cpus = OPAL_INT_TO_BOOL(value);
/* declare the hwloc data types */
tmp = OPAL_HWLOC_TOPO;
if (OPAL_SUCCESS != (value = opal_dss.register_type(opal_hwloc_pack,
opal_hwloc_unpack,
(opal_dss_copy_fn_t)opal_hwloc_copy,
(opal_dss_compare_fn_t)opal_hwloc_compare,
(opal_dss_print_fn_t)opal_hwloc_print,
OPAL_DSS_STRUCTURED,
"OPAL_HWLOC_TOPO", &tmp))) {
return value;
}
}
#endif
return OPAL_SUCCESS;
}
/* Set to true once opal_hwloc_get_print_buffer() has successfully created
 * print_tsd_key; guards against creating the key more than once. */
static bool fns_init=false;
/* Key handed to opal_tsd_getspecific()/opal_tsd_setspecific() to look up the
 * print-buffer set; created with buffer_cleanup as its cleanup callback. */
static opal_tsd_key_t print_tsd_key;
/* String returned by opal_hwloc_base_print_locality() when no print buffer
 * set can be obtained. */
static char* opal_hwloc_print_null = "NULL";
/*
 * Cleanup callback passed to opal_tsd_key_create() for print_tsd_key.
 *
 * value: the opal_hwloc_print_buffers_t stored under the key, or NULL
 *        (in which case nothing is done).
 *
 * Releases every string buffer of the set and then the set itself.  The
 * structure is obtained from malloc() in opal_hwloc_get_print_buffer(), so
 * releasing only the strings would leak the structure.
 */
static void buffer_cleanup(void *value)
{
    opal_hwloc_print_buffers_t *ptr = (opal_hwloc_print_buffers_t*)value;
    int i;

    if (NULL == ptr) {
        return;
    }

    for (i=0; i < OPAL_HWLOC_PRINT_NUM_BUFS; i++) {
        free(ptr->buffers[i]);
    }
    /* the container was malloc'ed as well - release it last */
    free(ptr);
}
/*
 * Return the print-buffer set stored under print_tsd_key, creating the key
 * (first call) and the buffer set (first call that finds none stored) on
 * demand.
 *
 * Returns NULL when the key cannot be created, the stored value cannot be
 * read, memory for a new set cannot be allocated, or the new set cannot be
 * stored under the key.  Callers must check for NULL.
 */
opal_hwloc_print_buffers_t *opal_hwloc_get_print_buffer(void)
{
    opal_hwloc_print_buffers_t *ptr;
    int ret, i;

    if (!fns_init) {
        /* setup the print_args function */
        if (OPAL_SUCCESS != (ret = opal_tsd_key_create(&print_tsd_key, buffer_cleanup))) {
            return NULL;
        }
        fns_init = true;
    }

    ret = opal_tsd_getspecific(print_tsd_key, (void**)&ptr);
    if (OPAL_SUCCESS != ret) {
        return NULL;
    }
    if (NULL != ptr) {
        /* a buffer set already exists - reuse it */
        return ptr;
    }

    /* nothing stored yet - build a fresh buffer set */
    ptr = (opal_hwloc_print_buffers_t*)malloc(sizeof(opal_hwloc_print_buffers_t));
    if (NULL == ptr) {
        return NULL;
    }
    for (i=0; i < OPAL_HWLOC_PRINT_NUM_BUFS; i++) {
        ptr->buffers[i] = (char *) malloc((OPAL_HWLOC_PRINT_MAX_SIZE+1) * sizeof(char));
        if (NULL == ptr->buffers[i]) {
            /* buffers [0, i) were allocated and must be released */
            goto release;
        }
    }
    ptr->cntr = 0;

    ret = opal_tsd_setspecific(print_tsd_key, (void*)ptr);
    if (OPAL_SUCCESS != ret) {
        /* the set could never be found again (nor cleaned up) if it is
         * not stored, so give it back instead of leaking it; here i equals
         * OPAL_HWLOC_PRINT_NUM_BUFS, so every buffer is released */
        goto release;
    }
    return ptr;

release:
    while (0 < i) {
        free(ptr->buffers[--i]);
    }
    free(ptr);
    return NULL;
}
/* Copy tag into buf starting at idx, follow it with a ':' separator and
 * return the index just past the separator. */
static int locality_append_tag(char *buf, int idx, const char *tag)
{
    while ('\0' != *tag) {
        buf[idx++] = *tag++;
    }
    buf[idx++] = ':';
    return idx;
}

/*
 * Render a locality bitmask as a ':'-separated list of level tags
 * (CL, CU, N, B, Nu, S, L3, L2, L1, C, Hwt - in that order).
 *
 * locality: the value tested with the OPAL_PROC_ON_LOCAL_* macros.
 *
 * Returns a NUL-terminated string held in one slot of the print-buffer
 * ring obtained from opal_hwloc_get_print_buffer().  If no level matches,
 * the string is "NON" when OPAL_PROC_NON_LOCAL is set in locality and "UNK"
 * otherwise.  If no print buffer is available, opal_hwloc_print_null is
 * returned.  The caller must not free the result.
 *
 * Each call claims the current ring slot and then advances the ring, so
 * the result stays intact across the next OPAL_HWLOC_PRINT_NUM_BUFS-1 calls
 * that use the ring in the same way (e.g. several calls feeding one
 * printf-style statement).
 */
char* opal_hwloc_base_print_locality(opal_hwloc_locality_t locality)
{
    opal_hwloc_print_buffers_t *ptr;
    char *buf;
    int idx;

    ptr = opal_hwloc_get_print_buffer();
    if (NULL == ptr) {
        return opal_hwloc_print_null;
    }

    /* cycle around the ring */
    if (OPAL_HWLOC_PRINT_NUM_BUFS == ptr->cntr) {
        ptr->cntr = 0;
    }
    /* claim the current slot, then advance so that the next request gets a
     * different buffer; wrap right away so the counter never holds an
     * out-of-range index */
    buf = ptr->buffers[ptr->cntr];
    ptr->cntr = (ptr->cntr + 1) % OPAL_HWLOC_PRINT_NUM_BUFS;

    idx = 0;

    if (OPAL_PROC_ON_LOCAL_CLUSTER(locality)) {
        idx = locality_append_tag(buf, idx, "CL");
    }
    if (OPAL_PROC_ON_LOCAL_CU(locality)) {
        idx = locality_append_tag(buf, idx, "CU");
    }
    if (OPAL_PROC_ON_LOCAL_NODE(locality)) {
        idx = locality_append_tag(buf, idx, "N");
    }
    if (OPAL_PROC_ON_LOCAL_BOARD(locality)) {
        idx = locality_append_tag(buf, idx, "B");
    }
    if (OPAL_PROC_ON_LOCAL_NUMA(locality)) {
        idx = locality_append_tag(buf, idx, "Nu");
    }
    if (OPAL_PROC_ON_LOCAL_SOCKET(locality)) {
        idx = locality_append_tag(buf, idx, "S");
    }
    if (OPAL_PROC_ON_LOCAL_L3CACHE(locality)) {
        idx = locality_append_tag(buf, idx, "L3");
    }
    if (OPAL_PROC_ON_LOCAL_L2CACHE(locality)) {
        idx = locality_append_tag(buf, idx, "L2");
    }
    if (OPAL_PROC_ON_LOCAL_L1CACHE(locality)) {
        idx = locality_append_tag(buf, idx, "L1");
    }
    if (OPAL_PROC_ON_LOCAL_CORE(locality)) {
        idx = locality_append_tag(buf, idx, "C");
    }
    if (OPAL_PROC_ON_LOCAL_HWTHREAD(locality)) {
        idx = locality_append_tag(buf, idx, "Hwt");
    }

    if (0 == idx) {
        /* no level matched: either explicitly non-local or unknown */
        idx = locality_append_tag(buf, idx,
                                  (OPAL_PROC_NON_LOCAL & locality) ? "NON" : "UNK");
    }
    /* turn the trailing ':' separator into the string terminator */
    buf[idx-1] = '\0';

    return buf;
}
#if OPAL_HAVE_HWLOC
/* Initializer for opal_hwloc_obj_data_t; passed to OBJ_CLASS_INSTANCE for
 * that type (presumably as the class constructor - confirm against the
 * macro definition).  Puts every field written here into an empty state. */
static void obj_data_const(opal_hwloc_obj_data_t *ptr)
{
/* no bitmap yet; obj_data_dest only frees it when it is non-NULL */
ptr->available = NULL;
ptr->npus = 0;
At long last, the fabled revision to the affinity system has arrived. A more detailed explanation of how this all works will be presented here: https://svn.open-mpi.org/trac/ompi/wiki/ProcessPlacement The wiki page is incomplete at the moment, but I hope to complete it over the next few days. I will provide updates on the devel list. As the wiki page states, the default and most commonly used options remain unchanged (except as noted below). New, esoteric and complex options have been added, but unless you are a true masochist, you are unlikely to use many of them beyond perhaps an initial curiosity-motivated experimentation. In a nutshell, this commit revamps the map/rank/bind procedure to take into account topology info on the compute nodes. I have, for the most part, preserved the default behaviors, with three notable exceptions: 1. I have at long last bowed my head in submission to the system admin's of managed clusters. For years, they have complained about our default of allowing users to oversubscribe nodes - i.e., to run more processes on a node than allocated slots. Accordingly, I have modified the default behavior: if you are running off of hostfile/dash-host allocated nodes, then the default is to allow oversubscription. If you are running off of RM-allocated nodes, then the default is to NOT allow oversubscription. Flags to override these behaviors are provided, so this only affects the default behavior. 2. both cpus/rank and stride have been removed. The latter was demanded by those who didn't understand the purpose behind it - and I agreed as the users who requested it are no longer using it. The former was removed temporarily pending implementation. 3. vm launch is now the sole method for starting OMPI. It was just too darned hard to maintain multiple launch procedures - maybe someday, provided someone can demonstrate a reason to do so. As Jeff stated, it is impossible to fully test a change of this size. 
I have tested it on Linux and Mac, covering all the default and simple options, singletons, and comm_spawn. That said, I'm sure others will find problems, so I'll be watching MTT results until this stabilizes. This commit was SVN r25476.
2011-11-15 07:40:11 +04:00
/* UINT_MAX is the initial idx value - presumably meaning "not yet
 * assigned"; confirm against the code that reads idx */
ptr->idx = UINT_MAX;
ptr->num_bound = 0;
}
/* Teardown counterpart of obj_data_const: gives the "available" bitmap back
 * to hwloc if one was ever attached.  No other field is touched. */
static void obj_data_dest(opal_hwloc_obj_data_t *ptr)
{
    if (NULL == ptr->available) {
        /* nothing was attached - nothing to release */
        return;
    }
    hwloc_bitmap_free(ptr->available);
}
/* Class instance for opal_hwloc_obj_data_t, declared relative to
 * opal_object_t and wired to obj_data_const / obj_data_dest (presumably as
 * parent class, constructor and destructor - see OBJ_CLASS_INSTANCE). */
OBJ_CLASS_INSTANCE(opal_hwloc_obj_data_t,
opal_object_t,
obj_data_const, obj_data_dest);
/* Initializer for opal_hwloc_summary_t: zeroes the resource type and the
 * object count. */
static void sum_const(opal_hwloc_summary_t *ptr)
{
    ptr->rtype = 0;
    ptr->num_objs = 0;
}
/* Class instance for opal_hwloc_summary_t, declared relative to
 * opal_list_item_t and wired to sum_const; NULL is passed in the last slot
 * (presumably meaning "no destructor" - see OBJ_CLASS_INSTANCE). */
OBJ_CLASS_INSTANCE(opal_hwloc_summary_t,
opal_list_item_t,
sum_const, NULL);
/* Initializer for opal_hwloc_topo_data_t: clears both pointers and
 * constructs the embedded list of summaries. */
static void topo_data_const(opal_hwloc_topo_data_t *ptr)
{
    ptr->userdata = NULL;
    ptr->available = NULL;
    OBJ_CONSTRUCT(&ptr->summaries, opal_list_t);
}
/* Teardown for opal_hwloc_topo_data_t: frees the "available" bitmap when one
 * is attached, releases every item still on the summaries list, destructs
 * the list and clears the userdata pointer (userdata itself is not freed
 * here). */
static void topo_data_dest(opal_hwloc_topo_data_t *ptr)
{
    opal_list_item_t *entry;

    if (NULL != ptr->available) {
        hwloc_bitmap_free(ptr->available);
    }

    /* drain the list front to back, releasing each entry as it comes off */
    for (entry = opal_list_remove_first(&ptr->summaries);
         NULL != entry;
         entry = opal_list_remove_first(&ptr->summaries)) {
        OBJ_RELEASE(entry);
    }

    OBJ_DESTRUCT(&ptr->summaries);
    ptr->userdata = NULL;
}
/* Class instance for opal_hwloc_topo_data_t, declared relative to
 * opal_object_t and wired to topo_data_const / topo_data_dest (presumably as
 * parent class, constructor and destructor - see OBJ_CLASS_INSTANCE). */
OBJ_CLASS_INSTANCE(opal_hwloc_topo_data_t,
opal_object_t,
topo_data_const,
topo_data_dest);
#endif