This commit introduces a new "mindist" ORTE RMAPS mapper, as well as
some relevant updates/new functionality in the opal/mca/hwloc and orte/mca/rmaps bases. This work was mainly developed by Mellanox, with a bunch of advice from Ralph Castain, and some minor advice from Brice Goglin and Jeff Squyres. Even though this is mainly Mellanox's work, Jeff is committing only for logistical reasons (he holds the hg+svn combo tree, and can therefore commit it directly back to SVN). ----- Implemented distance-based mapping algorithm as a new "mindist" component in the rmaps framework. It allows mapping processes by NUMA due to PCI locality information as reported by the BIOS - from the closest to device to furthest. To use this algorithm, specify: {{{mpirun --map-by dist:<device_name>}}} where <device_name> can be mlx5_0, ib0, etc. There are two modes provided: 1. bynode: load-balancing across nodes 1. byslot: go through slots sequentially (i.e., the first nodes are more loaded) These options are regulated by the optional ''span'' modifier; the command line parameter looks like: {{{mpirun --map-by dist:<device_name>,span}}} So, for example, if there are 2 nodes, each with 8 cores, and we'd like to run 10 processes, the mindist algorithm will place 8 processes to the first node and 2 to the second by default. But if you want to place 5 processes to each node, you can add a span modifier in your command line to do that. If there are two NUMA nodes on the node, each with 4 cores, and we run 6 processes, the mindist algorithm will try to find the NUMA closest to the specified device, and if successful, it will place 4 processes on that NUMA but leaving the remaining two to the next NUMA node. You can also specify the number of cpus per MPI process. This option is handled so that we map as many processes to the closest NUMA as we can (number of available processors at the NUMA divided by number of cpus per rank) and then go on with the next closest NUMA. The default binding option for this mapping is bind-to-numa. 
It works if you don't specify any binding policy. But if you specify a binding level that is "lower" than NUMA (i.e., hwthread, core, or socket), it will bind to whatever level you specify. This commit was SVN r28552.
Этот коммит содержится в:
родитель
55382c1bf8
Коммит
6d173af329
@ -93,6 +93,14 @@ OPAL_DECLSPEC opal_hwloc_locality_t opal_hwloc_base_get_relative_locality(hwloc_
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_hwloc_base_get_local_cpuset(void);
|
||||
|
||||
struct orte_rmaps_numa_node_t {
|
||||
opal_list_item_t super;
|
||||
int index;
|
||||
float dist_from_closed;
|
||||
};
|
||||
typedef struct orte_rmaps_numa_node_t orte_rmaps_numa_node_t;
|
||||
OBJ_CLASS_DECLARATION(orte_rmaps_numa_node_t);
|
||||
|
||||
/**
|
||||
* Enum for what memory allocation policy we want for user allocations.
|
||||
* MAP = memory allocation policy.
|
||||
@ -156,6 +164,10 @@ OPAL_DECLSPEC unsigned int opal_hwloc_base_get_obj_idx(hwloc_topology_t topo,
|
||||
hwloc_obj_t obj,
|
||||
opal_hwloc_resource_type_t rtype);
|
||||
|
||||
OPAL_DECLSPEC void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo,
|
||||
const char* device_name,
|
||||
opal_list_t *sorted_list);
|
||||
|
||||
/**
|
||||
* Get the number of pu's under a given hwloc object.
|
||||
*/
|
||||
|
@ -28,7 +28,6 @@ int opal_hwloc_pack(opal_buffer_t *buffer, const void *src,
|
||||
for (i=0; i < num_vals; i++) {
|
||||
t = tarray[i];
|
||||
|
||||
|
||||
/* extract an xml-buffer representation of the tree */
|
||||
if (0 != hwloc_topology_export_xmlbuffer(t, &xmlbuffer, &len)) {
|
||||
return OPAL_ERROR;
|
||||
@ -44,6 +43,7 @@ int opal_hwloc_pack(opal_buffer_t *buffer, const void *src,
|
||||
if (NULL != xmlbuffer) {
|
||||
free(xmlbuffer);
|
||||
}
|
||||
|
||||
/* get the available support - hwloc unfortunately does
|
||||
* not include this info in its xml export!
|
||||
*/
|
||||
@ -102,7 +102,7 @@ int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
|
||||
/* since we are loading this from an external source, we have to
|
||||
* explicitly set a flag so hwloc sets things up correctly
|
||||
*/
|
||||
if (0 != hwloc_topology_set_flags(t, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)) {
|
||||
if (0 != hwloc_topology_set_flags(t, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) {
|
||||
free(xmlbuffer);
|
||||
rc = OPAL_ERROR;
|
||||
hwloc_topology_destroy(t);
|
||||
@ -118,6 +118,7 @@ int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
|
||||
if (NULL != xmlbuffer) {
|
||||
free(xmlbuffer);
|
||||
}
|
||||
|
||||
/* get the available support - hwloc unfortunately does
|
||||
* not include this info in its xml import!
|
||||
*/
|
||||
@ -134,6 +135,7 @@ int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
|
||||
if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, support->membind, &cnt, OPAL_BYTE))) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* pass it back */
|
||||
tarray[i] = t;
|
||||
|
||||
@ -148,6 +150,7 @@ int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
|
||||
|
||||
int opal_hwloc_copy(hwloc_topology_t *dest, hwloc_topology_t src, opal_data_type_t type)
|
||||
{
|
||||
int i;
|
||||
char *xml;
|
||||
int len;
|
||||
struct hwloc_topology_support *support, *destsupport;
|
||||
@ -205,7 +208,6 @@ int opal_hwloc_compare(const hwloc_topology_t topo1,
|
||||
return OPAL_VALUE2_GREATER;
|
||||
}
|
||||
|
||||
|
||||
/* do the comparison the "cheat" way - get an xml representation
|
||||
* of each tree, and strcmp!
|
||||
*/
|
||||
|
@ -452,10 +452,19 @@ static void sum_const(opal_hwloc_summary_t *ptr)
|
||||
{
|
||||
ptr->num_objs = 0;
|
||||
ptr->rtype = 0;
|
||||
OBJ_CONSTRUCT(&ptr->sorted_by_dist_list, opal_list_t);
|
||||
}
|
||||
static void sum_dest(opal_hwloc_summary_t *ptr)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
while (NULL != (item = opal_list_remove_first(&ptr->sorted_by_dist_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&ptr->sorted_by_dist_list);
|
||||
}
|
||||
OBJ_CLASS_INSTANCE(opal_hwloc_summary_t,
|
||||
opal_list_item_t,
|
||||
sum_const, NULL);
|
||||
sum_const, sum_dest);
|
||||
static void topo_data_const(opal_hwloc_topo_data_t *ptr)
|
||||
{
|
||||
ptr->available = NULL;
|
||||
@ -479,4 +488,9 @@ OBJ_CLASS_INSTANCE(opal_hwloc_topo_data_t,
|
||||
opal_object_t,
|
||||
topo_data_const,
|
||||
topo_data_dest);
|
||||
|
||||
OBJ_CLASS_INSTANCE(orte_rmaps_numa_node_t,
|
||||
opal_list_item_t,
|
||||
NULL,
|
||||
NULL);
|
||||
#endif
|
||||
|
@ -214,7 +214,7 @@ int opal_hwloc_base_get_topology(void)
|
||||
if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
|
||||
0 != hwloc_topology_set_flags(opal_hwloc_topology,
|
||||
(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
|
||||
HWLOC_TOPOLOGY_FLAG_WHOLE_IO)) ||
|
||||
HWLOC_TOPOLOGY_FLAG_IO_DEVICES)) ||
|
||||
0 != hwloc_topology_load(opal_hwloc_topology)) {
|
||||
return OPAL_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
@ -844,15 +844,17 @@ hwloc_obj_t opal_hwloc_base_find_min_bound_target_under_obj(hwloc_topology_t top
|
||||
|
||||
loc = df_search_min_bound(topo, obj, target, cache_level, &min_bound);
|
||||
|
||||
if (HWLOC_OBJ_CACHE == target) {
|
||||
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
|
||||
"hwloc:base:min_bound_under_obj found min bound of %u on %s:%u:%u",
|
||||
min_bound, hwloc_obj_type_string(target),
|
||||
cache_level, loc->logical_index));
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
|
||||
"hwloc:base:min_bound_under_obj found min bound of %u on %s:%u",
|
||||
min_bound, hwloc_obj_type_string(target), loc->logical_index));
|
||||
if (NULL != loc) {
|
||||
if (HWLOC_OBJ_CACHE == target) {
|
||||
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
|
||||
"hwloc:base:min_bound_under_obj found min bound of %u on %s:%u:%u",
|
||||
min_bound, hwloc_obj_type_string(target),
|
||||
cache_level, loc->logical_index));
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
|
||||
"hwloc:base:min_bound_under_obj found min bound of %u on %s:%u",
|
||||
min_bound, hwloc_obj_type_string(target), loc->logical_index));
|
||||
}
|
||||
}
|
||||
|
||||
return loc;
|
||||
@ -1631,3 +1633,130 @@ int opal_hwloc_base_cset2mapstr(char *str, int len, hwloc_cpuset_t cpuset)
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static int dist_cmp_fn (opal_list_item_t **a, opal_list_item_t **b)
|
||||
{
|
||||
orte_rmaps_numa_node_t *aitem = *((orte_rmaps_numa_node_t **) a);
|
||||
orte_rmaps_numa_node_t *bitem = *((orte_rmaps_numa_node_t **) b);
|
||||
|
||||
if (bitem->dist_from_closed > aitem->dist_from_closed) {
|
||||
return 1;
|
||||
} else if( aitem->dist_from_closed == bitem->dist_from_closed ) {
|
||||
return 0;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static void sort_by_dist(hwloc_topology_t topo, const char* device_name, opal_list_t *sorted_list)
|
||||
{
|
||||
hwloc_obj_t device_obj = NULL;
|
||||
hwloc_obj_t obj = NULL, root = NULL;
|
||||
const struct hwloc_distances_s* distances;
|
||||
opal_list_item_t *numa_item;
|
||||
orte_rmaps_numa_node_t *numa_node;
|
||||
int close_node_index;
|
||||
float latency;
|
||||
int j;
|
||||
int depth;
|
||||
unsigned i;
|
||||
|
||||
for (device_obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_OS_DEVICE, 0); device_obj; device_obj = hwloc_get_next_osdev(topo, device_obj)) {
|
||||
if (device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS
|
||||
|| device_obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
|
||||
if (!strcmp(device_obj->name, device_name)) {
|
||||
/* find numa node containing this device */
|
||||
obj = device_obj->parent;
|
||||
while ((obj != NULL) && (obj->type != HWLOC_OBJ_NODE)) {
|
||||
obj = obj->parent;
|
||||
}
|
||||
if (obj == NULL) {
|
||||
return;
|
||||
} else {
|
||||
close_node_index = obj->logical_index;
|
||||
}
|
||||
|
||||
/* find distance matrix for all numa nodes */
|
||||
distances = hwloc_get_whole_distance_matrix_by_type(topo, HWLOC_OBJ_NODE);
|
||||
if (NULL == distances) {
|
||||
/* we can try to find distances under group object. This info can be there. */
|
||||
depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NODE);
|
||||
if (depth < 0) {
|
||||
return;
|
||||
}
|
||||
root = hwloc_get_root_obj(topo);
|
||||
for (i = 0; i < root->arity; i++) {
|
||||
obj = root->children[i];
|
||||
if (obj->distances_count > 0) {
|
||||
for(j = 0; j < obj->distances_count; j++) {
|
||||
if (obj->distances[j]->relative_depth + 1 == depth) {
|
||||
distances = obj->distances[j];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* find all distances for our close node with logical index = close_node_index as close_node_index + nbobjs*j */
|
||||
if ((NULL == distances) || (0 == distances->nbobjs)) {
|
||||
return;
|
||||
}
|
||||
/* fill list of numa nodes */
|
||||
for (j = 0; j < distances->nbobjs; j++) {
|
||||
latency = distances->latency[close_node_index + distances->nbobjs * j];
|
||||
numa_node = OBJ_NEW(orte_rmaps_numa_node_t);
|
||||
numa_node->index = j;
|
||||
numa_node->dist_from_closed = latency;
|
||||
opal_list_append(sorted_list, &numa_node->super);
|
||||
}
|
||||
/* sort numa nodes by distance from the closest one to PCI */
|
||||
opal_list_sort(sorted_list, dist_cmp_fn);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, const char* device_name, opal_list_t *sorted_list)
|
||||
{
|
||||
hwloc_obj_t obj;
|
||||
opal_list_item_t *item;
|
||||
opal_hwloc_summary_t *sum;
|
||||
opal_hwloc_topo_data_t *data;
|
||||
orte_rmaps_numa_node_t *numa, *copy_numa;
|
||||
|
||||
obj = hwloc_get_root_obj(topo);
|
||||
|
||||
/* first see if the topology already has this info */
|
||||
/* we call opal_hwloc_base_get_nbobjs_by_type() before it to fill summary object so it should exist*/
|
||||
data = (opal_hwloc_topo_data_t*)obj->userdata;
|
||||
if (NULL != data) {
|
||||
for (item = opal_list_get_first(&data->summaries);
|
||||
item != opal_list_get_end(&data->summaries);
|
||||
item = opal_list_get_next(item)) {
|
||||
sum = (opal_hwloc_summary_t*)item;
|
||||
if (HWLOC_OBJ_NODE == sum->type) {
|
||||
if (opal_list_get_size(&sum->sorted_by_dist_list) > 0) {
|
||||
OPAL_LIST_FOREACH(numa, &(sum->sorted_by_dist_list), orte_rmaps_numa_node_t) {
|
||||
copy_numa = OBJ_NEW(orte_rmaps_numa_node_t);
|
||||
copy_numa->index = numa->index;
|
||||
copy_numa->dist_from_closed = numa->dist_from_closed;
|
||||
opal_list_append(sorted_list, &copy_numa->super);
|
||||
}
|
||||
return;
|
||||
}else {
|
||||
/* don't already know it - go get it */
|
||||
sort_by_dist(topo, device_name, sorted_list);
|
||||
/* store this info in summary object for later usage */
|
||||
OPAL_LIST_FOREACH(numa, sorted_list, orte_rmaps_numa_node_t) {
|
||||
copy_numa = OBJ_NEW(orte_rmaps_numa_node_t);
|
||||
copy_numa->index = numa->index;
|
||||
copy_numa->dist_from_closed = numa->dist_from_closed;
|
||||
opal_list_append(&(sum->sorted_by_dist_list), &copy_numa->super);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -152,6 +152,7 @@ typedef struct {
|
||||
unsigned cache_level;
|
||||
unsigned int num_objs;
|
||||
opal_hwloc_resource_type_t rtype;
|
||||
opal_list_t sorted_by_dist_list;
|
||||
} opal_hwloc_summary_t;
|
||||
OBJ_CLASS_DECLARATION(opal_hwloc_summary_t);
|
||||
|
||||
|
@ -68,6 +68,8 @@ typedef struct {
|
||||
/* default mapping directives */
|
||||
orte_mapping_policy_t mapping;
|
||||
orte_ranking_policy_t ranking;
|
||||
/* device specification for min distance mapping */
|
||||
char *device;
|
||||
} orte_rmaps_base_t;
|
||||
|
||||
/**
|
||||
|
@ -604,9 +604,7 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
||||
|
||||
/* binding requested */
|
||||
/* if the job was mapped by the corresponding target, then
|
||||
* there is nothing more to do - the launch message creator
|
||||
* will see that the binding object is NULL and will simply
|
||||
* use the locale as the place to bind the proc
|
||||
* we bind in place
|
||||
*
|
||||
* otherwise, we have to bind either up or down the hwloc
|
||||
* tree. If we are binding upwards (e.g., mapped to hwthread
|
||||
@ -617,7 +615,50 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
|
||||
* to core), then we have to do a round-robin assigment of
|
||||
* procs to the resources below.
|
||||
*/
|
||||
if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
|
||||
if (ORTE_MAPPING_BYDIST == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
int rc = ORTE_SUCCESS;
|
||||
if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps: bindings for job %s - dist to numa",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else if (OPAL_BIND_TO_NUMA < OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_PU, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CORE, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 1))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 2))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_CACHE, 3))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
} else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
if (ORTE_SUCCESS != (rc = bind_downwards(jdata, HWLOC_OBJ_SOCKET, 0))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if the binding policy is less than numa, then we are unbound - so
|
||||
* just ignore this and return (should have been caught in prior
|
||||
* tests anyway as only options meeting that criteria are "none"
|
||||
* and "board")
|
||||
*/
|
||||
return rc;
|
||||
} else if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) {
|
||||
int rc;
|
||||
if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
|
@ -36,7 +36,6 @@
|
||||
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
|
||||
/*
|
||||
* The following file was created by configure. It contains extern
|
||||
* statements and the definition of an array of pointers to each
|
||||
@ -101,7 +100,7 @@ static int orte_rmaps_base_register(mca_base_register_flag_t flags)
|
||||
rmaps_base_mapping_policy = NULL;
|
||||
var_id = mca_base_var_register("orte", "rmaps", "base", "mapping_policy",
|
||||
#if OPAL_HAVE_HWLOC
|
||||
"Mapping Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node | seq], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
|
||||
"Mapping Policy [slot (default) | hwthread | core | l1cache | l2cache | l3cache | socket | numa | board | node | seq | dist], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
|
||||
#else
|
||||
"Mapping Policy [slot (default) | node], with allowed modifiers :SPAN,OVERSUBSCRIBE,NOOVERSUBSCRIBE",
|
||||
#endif
|
||||
@ -250,6 +249,25 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
if (2 == opal_argv_count(ck)) {
|
||||
/* if the policy is "dist", then we set the policy to that value
|
||||
* and save the second argument as the device
|
||||
*/
|
||||
#if OPAL_HAVE_HWLOC
|
||||
if (0 == strncasecmp(ck[0], "dist", len)) {
|
||||
tmp = ORTE_MAPPING_BYDIST;
|
||||
ck2 = opal_argv_split(ck[1], ',');
|
||||
if (ck2[0] != NULL) {
|
||||
orte_rmaps_base.device = strdup(ck2[0]);
|
||||
for (i=1; NULL != ck2[i]; i++) {
|
||||
if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
|
||||
orte_rmaps_base.mapping |= ORTE_MAPPING_SPAN;
|
||||
}
|
||||
}
|
||||
}
|
||||
opal_argv_free(ck2);
|
||||
goto setpolicy;
|
||||
}
|
||||
#endif
|
||||
ck2 = opal_argv_split(ck[1], ',');
|
||||
for (i=0; NULL != ck2[i]; i++) {
|
||||
if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
|
||||
@ -314,6 +332,7 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
||||
opal_argv_free(ck);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
setpolicy:
|
||||
ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, tmp);
|
||||
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
|
||||
opal_argv_free(ck);
|
||||
@ -418,15 +437,21 @@ static int orte_rmaps_base_open(mca_base_open_flag_t flags)
|
||||
}
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* if the cpus/rank > 1, then we have to bind to cores UNLESS the binding has
|
||||
* already been set to something else
|
||||
*/
|
||||
if (1 < orte_rmaps_base.cpus_per_rank &&
|
||||
!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
|
||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
|
||||
} else {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
|
||||
if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
|
||||
/* if MAP BY DIST then we set binding policy to numa UNLESS the binding has
|
||||
* already been set to something else
|
||||
*/
|
||||
if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYDIST) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_NUMA);
|
||||
} else if (1 < orte_rmaps_base.cpus_per_rank) {
|
||||
/* if the cpus/rank > 1, then we have to bind to cores UNLESS the binding has
|
||||
* already been set to something else
|
||||
*/
|
||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD);
|
||||
} else {
|
||||
OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -155,6 +155,9 @@ char* orte_rmaps_base_print_mapping(orte_mapping_policy_t mapping)
|
||||
case ORTE_MAPPING_BYUSER:
|
||||
map = "BYUSER";
|
||||
break;
|
||||
case ORTE_MAPPING_BYDIST:
|
||||
map = "MINDIST";
|
||||
break;
|
||||
default:
|
||||
if (ORTE_MAPPING_PPR & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
|
||||
map = "PPR";
|
||||
|
47
orte/mca/rmaps/mindist/Makefile.am
Обычный файл
47
orte/mca/rmaps/mindist/Makefile.am
Обычный файл
@ -0,0 +1,47 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
dist_pkgdata_DATA = help-orte-rmaps-md.txt
|
||||
|
||||
sources = \
|
||||
rmaps_mindist.h \
|
||||
rmaps_mindist_module.c \
|
||||
rmaps_mindist_component.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_rmaps_mindist_DSO
|
||||
component_noinst =
|
||||
component_install = mca_rmaps_mindist.la
|
||||
else
|
||||
component_noinst = libmca_rmaps_mindist.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_rmaps_mindist_la_SOURCES = $(sources)
|
||||
mca_rmaps_mindist_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_rmaps_mindist_la_SOURCES =$(sources)
|
||||
libmca_rmaps_mindist_la_LDFLAGS = -module -avoid-version
|
19
orte/mca/rmaps/mindist/configure.m4
Обычный файл
19
orte/mca/rmaps/mindist/configure.m4
Обычный файл
@ -0,0 +1,19 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# MCA_rmaps_mindist_CONFIG([action-if-found], [action-if-not-found])
|
||||
# -----------------------------------------------------------
|
||||
AC_DEFUN([MCA_orte_rmaps_mindist_CONFIG], [
|
||||
AC_CONFIG_FILES([orte/mca/rmaps/mindist/Makefile])
|
||||
|
||||
AS_IF([test "$OPAL_HAVE_HWLOC" = 1],
|
||||
[$1],
|
||||
[$2])
|
||||
])
|
31
orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
Обычный файл
31
orte/mca/rmaps/mindist/help-orte-rmaps-md.txt
Обычный файл
@ -0,0 +1,31 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
#
|
||||
[multi-apps-and-zero-np]
|
||||
Open MPI found multiple applications to be launched, and at least one
|
||||
that failed to specify the number of processes to execute. When
|
||||
specifying multiple applications, you must specify how many processes
|
||||
of each to launch via the -np argument.
|
||||
#
|
||||
[orte-rmaps-mindist:no-pci-locality-info]
|
||||
No PCI locality information could be found on at least one node:
|
||||
|
||||
Node: %s
|
||||
|
||||
Open MPI therefore cannot map the application as specified.
|
41
orte/mca/rmaps/mindist/rmaps_mindist.h
Обычный файл
41
orte/mca/rmaps/mindist/rmaps_mindist.h
Обычный файл
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* Resource Mapping
|
||||
*/
|
||||
#ifndef ORTE_RMAPS_MINDIST_H
|
||||
#define ORTE_RMAPS_MINDIST_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "opal/mca/hwloc/hwloc.h"
|
||||
#include "opal/class/opal_list.h"
|
||||
|
||||
#include "orte/mca/rmaps/rmaps.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_mindist_component;
|
||||
extern orte_rmaps_base_module_t orte_rmaps_mindist_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
99
orte/mca/rmaps/mindist/rmaps_mindist_component.c
Обычный файл
99
orte/mca/rmaps/mindist/rmaps_mindist_component.c
Обычный файл
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
#include "rmaps_mindist.h"
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static int orte_rmaps_mindist_open(void);
|
||||
static int orte_rmaps_mindist_close(void);
|
||||
static int orte_rmaps_mindist_query(mca_base_module_t **module, int *priority);
|
||||
static int orte_rmaps_mindist_register(void);
|
||||
|
||||
static int my_priority = 20;
|
||||
|
||||
orte_rmaps_base_component_t mca_rmaps_mindist_component = {
|
||||
{
|
||||
ORTE_RMAPS_BASE_VERSION_2_0_0,
|
||||
|
||||
"mindist", /* MCA component name */
|
||||
ORTE_MAJOR_VERSION, /* MCA component major version */
|
||||
ORTE_MINOR_VERSION, /* MCA component minor version */
|
||||
ORTE_RELEASE_VERSION, /* MCA component release version */
|
||||
orte_rmaps_mindist_open, /* component open */
|
||||
orte_rmaps_mindist_close, /* component close */
|
||||
orte_rmaps_mindist_query, /* component query */
|
||||
orte_rmaps_mindist_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static int orte_rmaps_mindist_register(void)
|
||||
{
|
||||
(void) mca_base_component_var_register(&mca_rmaps_mindist_component.base_version,
|
||||
"priority", "Priority of the mindist rmaps component",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&my_priority);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* component open/close/init function
|
||||
*/
|
||||
static int orte_rmaps_mindist_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static int orte_rmaps_mindist_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
/* the RMAPS framework is -only- opened on HNP's,
|
||||
* so no need to check for that here
|
||||
*/
|
||||
|
||||
*priority = my_priority;
|
||||
*module = (mca_base_module_t *)&orte_rmaps_mindist_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all subsystems.
|
||||
*/
|
||||
|
||||
static int orte_rmaps_mindist_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
400
orte/mca/rmaps/mindist/rmaps_mindist_module.c
Обычный файл
400
orte/mca/rmaps/mindist/rmaps_mindist_module.c
Обычный файл
@ -0,0 +1,400 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <errno.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
|
||||
#include "opal/mca/base/mca_base_var.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/error_strings.h"
|
||||
|
||||
#include "orte/mca/rmaps/base/rmaps_private.h"
|
||||
#include "orte/mca/rmaps/base/base.h"
|
||||
#include "orte/mca/rmaps/mindist/rmaps_mindist.h"
|
||||
|
||||
static int mindist_map(orte_job_t *jdata);
|
||||
|
||||
orte_rmaps_base_module_t orte_rmaps_mindist_module = {
|
||||
mindist_map
|
||||
};
|
||||
|
||||
/*
|
||||
* Create a round-robin mapping for the job.
|
||||
*/
|
||||
static int mindist_map(orte_job_t *jdata)
|
||||
{
|
||||
orte_app_context_t *app;
|
||||
int i, j;
|
||||
unsigned int k;
|
||||
hwloc_obj_t obj = NULL;
|
||||
opal_list_t node_list;
|
||||
opal_list_t numa_list;
|
||||
opal_list_item_t *item;
|
||||
opal_list_item_t *numa_item;
|
||||
orte_rmaps_numa_node_t *numa;
|
||||
orte_node_t *node;
|
||||
orte_proc_t *proc;
|
||||
int nprocs_mapped;
|
||||
int extra_procs, navg, nextra;
|
||||
orte_std_cntr_t num_nodes, num_slots;
|
||||
unsigned int npus, total_npus, num_procs_to_assign, required;
|
||||
int rc;
|
||||
mca_base_component_t *c = &mca_rmaps_mindist_component.base_version;
|
||||
bool initial_map=true;
|
||||
bool bynode = false;
|
||||
|
||||
/* this mapper can only handle initial launch
|
||||
* when mindist mapping is desired
|
||||
*/
|
||||
if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:mindist: job %s is being restarted - mindist cannot map",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
return ORTE_ERR_TAKE_NEXT_OPTION;
|
||||
}
|
||||
if (NULL != jdata->map->req_mapper &&
|
||||
0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
|
||||
/* a mapper has been specified, and it isn't me */
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:mindist: job %s not using mindist mapper",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
return ORTE_ERR_TAKE_NEXT_OPTION;
|
||||
}
|
||||
if (ORTE_MAPPING_BYDIST != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
|
||||
/* not me */
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:mindist: job %s not using mindist mapper",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
return ORTE_ERR_TAKE_NEXT_OPTION;
|
||||
}
|
||||
|
||||
/* there are two modes for mapping by dist: span and not-span. The
|
||||
* span mode essentially operates as if there was just a single
|
||||
* "super-node" in the system - i.e., it balances the load across
|
||||
* all objects of the indicated type regardless of their location.
|
||||
* In essence, it acts as if we placed one proc on each object, cycling
|
||||
* across all objects on all nodes, and then wrapped around to place
|
||||
* another proc on each object, doing so until all procs were placed.
|
||||
*
|
||||
* In contrast, the non-span mode operates similar to byslot mapping.
|
||||
* All slots on each node are filled, assigning each proc to an object
|
||||
* on that node in a balanced fashion, and then the mapper moves on
|
||||
* to the next node. Thus, procs tend to be "front loaded" onto the
|
||||
* list of nodes, as opposed to being "load balanced" in the span mode
|
||||
*/
|
||||
|
||||
if (ORTE_MAPPING_SPAN & jdata->map->mapping) {
|
||||
/* do a bynode mapping */
|
||||
bynode = true;
|
||||
} else {
|
||||
/* do a byslot mapping */
|
||||
bynode = false;
|
||||
}
|
||||
|
||||
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:mindist: mapping job %s",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
|
||||
/* flag that I did the mapping */
|
||||
if (NULL != jdata->map->last_mapper) {
|
||||
free(jdata->map->last_mapper);
|
||||
}
|
||||
jdata->map->last_mapper = strdup(c->mca_component_name);
|
||||
|
||||
/* start at the beginning... */
|
||||
jdata->num_procs = 0;
|
||||
|
||||
/* cycle through the app_contexts, mapping them sequentially */
|
||||
for(i=0; i < jdata->apps->size; i++) {
|
||||
if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* setup the nodelist here in case we jump to error */
|
||||
OBJ_CONSTRUCT(&node_list, opal_list_t);
|
||||
|
||||
/* if the number of processes wasn't specified, then we know there can be only
|
||||
* one app_context allowed in the launch, and that we are to launch it across
|
||||
* all available slots. We'll double-check the single app_context rule first
|
||||
*/
|
||||
if (0 == app->num_procs && 1 < jdata->num_apps) {
|
||||
orte_show_help("help-orte-rmaps-mindist.txt", "multi-apps-and-zero-np",
|
||||
true, jdata->num_apps, NULL);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* for each app_context, we have to get the list of nodes that it can
|
||||
* use since that can now be modified with a hostfile and/or -host
|
||||
* option
|
||||
*/
|
||||
if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
|
||||
jdata->map->mapping, initial_map, false))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto error;
|
||||
}
|
||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
|
||||
/* flag that all subsequent requests should not reset the node->mapped flag */
|
||||
initial_map = false;
|
||||
|
||||
/* if a bookmark exists from some prior mapping, set us to start there */
|
||||
jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);
|
||||
|
||||
if (0 == app->num_procs) {
|
||||
/* set the num_procs to equal the number of slots on these mapped nodes */
|
||||
app->num_procs = num_slots;
|
||||
}
|
||||
|
||||
nprocs_mapped = 0;
|
||||
if (!num_nodes) {
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
if (bynode) {
|
||||
/* calculate num_procs_to_assign for bynode case */
|
||||
navg = app->num_procs / num_nodes;
|
||||
nextra = app->num_procs - navg * num_nodes;
|
||||
num_procs_to_assign = navg;
|
||||
if (nextra > 0)
|
||||
num_procs_to_assign++;
|
||||
}
|
||||
|
||||
/* iterate through the list of nodes */
|
||||
for (item = opal_list_get_first(&node_list);
|
||||
item != opal_list_get_end(&node_list);
|
||||
item = opal_list_get_next(item)) {
|
||||
node = (orte_node_t*)item;
|
||||
|
||||
if (NULL == node->topology) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
|
||||
true, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
/* get the root object as we are not assigning
|
||||
* locale except at the node level
|
||||
*/
|
||||
obj = hwloc_get_root_obj(node->topology);
|
||||
if (NULL == obj) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
|
||||
true, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* add the node to the map, if needed */
|
||||
if (!node->mapped) {
|
||||
if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto error;
|
||||
}
|
||||
node->mapped = true;
|
||||
OBJ_RETAIN(node); /* maintain accounting on object */
|
||||
jdata->map->num_nodes++;
|
||||
}
|
||||
|
||||
/* get the number of available pus */
|
||||
if (opal_hwloc_use_hwthreads_as_cpus) {
|
||||
total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_PU, 0, OPAL_HWLOC_AVAILABLE);
|
||||
} else {
|
||||
total_npus = opal_hwloc_base_get_nbobjs_by_type(node->topology, HWLOC_OBJ_CORE, 0, OPAL_HWLOC_AVAILABLE);
|
||||
}
|
||||
if (bynode) {
|
||||
if (total_npus < num_procs_to_assign * orte_rmaps_base.cpus_per_rank) {
|
||||
/* check if oversubscribing is allowed */
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
} else {
|
||||
node->oversubscribed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
OBJ_CONSTRUCT(&numa_list, opal_list_t);
|
||||
opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
|
||||
if (opal_list_get_size(&numa_list) > 0) {
|
||||
j = 0;
|
||||
required = 0;
|
||||
OPAL_LIST_FOREACH(numa, &numa_list, orte_rmaps_numa_node_t) {
|
||||
/* get the hwloc object for this numa */
|
||||
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
npus = opal_hwloc_base_get_npus(node->topology, obj);
|
||||
if (bynode) {
|
||||
required = ((num_procs_to_assign-j) > npus/orte_rmaps_base.cpus_per_rank) ? (npus/orte_rmaps_base.cpus_per_rank) : (num_procs_to_assign-j);
|
||||
} else {
|
||||
required = npus/orte_rmaps_base.cpus_per_rank;
|
||||
}
|
||||
for (k = 0; (k < required) && (nprocs_mapped < app->num_procs); k++) {
|
||||
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
nprocs_mapped++;
|
||||
j++;
|
||||
proc->locale = obj;
|
||||
}
|
||||
if ((nprocs_mapped == (int)app->num_procs) || (bynode && ((int)num_procs_to_assign == j))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:mindist: assigned %d procs to node %s",
|
||||
j, node->name);
|
||||
} else {
|
||||
/* don't have info about pci locality */
|
||||
orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:no-pci-locality-info",
|
||||
true, node->name);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
while (NULL != (numa_item = opal_list_remove_first(&numa_list))) {
|
||||
OBJ_RELEASE(numa_item);
|
||||
}
|
||||
OBJ_DESTRUCT(&numa_list);
|
||||
if (bynode) {
|
||||
nextra--;
|
||||
if (nextra == 0) {
|
||||
num_procs_to_assign--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* If we get to the end of all the nodes and still have procs remaining, then
|
||||
* we check the oversubscribed flag - if oversubscription is allowed, then
|
||||
* begin assigning procs round-robin *bynode* until all procs have been assigned.
|
||||
* This ensures that the overload is evenly distributed across all nodes.
|
||||
*/
|
||||
|
||||
extra_procs = app->num_procs - nprocs_mapped;
|
||||
if (extra_procs > 0) {
|
||||
/* check if oversubscribing is allowed */
|
||||
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
|
||||
orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
|
||||
true, app->num_procs, app->app);
|
||||
rc = ORTE_ERR_SILENT;
|
||||
goto error;
|
||||
}
|
||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:mindist job %s is oversubscribed - performing second pass",
|
||||
ORTE_JOBID_PRINT(jdata->jobid));
|
||||
num_procs_to_assign = extra_procs/num_nodes;
|
||||
nextra = extra_procs % num_nodes;
|
||||
if (nextra > 0) {
|
||||
num_procs_to_assign++;
|
||||
}
|
||||
for (item = opal_list_get_first(&node_list);
|
||||
item != opal_list_get_end(&node_list);
|
||||
item = opal_list_get_next(item)) {
|
||||
node = (orte_node_t*)item;
|
||||
|
||||
if (nprocs_mapped == app->num_procs)
|
||||
break;
|
||||
node->oversubscribed = true;
|
||||
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
|
||||
"mca:rmaps:mindist: second pass assigning %d extra procs to node %s",
|
||||
(int)num_procs_to_assign, node->name);
|
||||
OBJ_CONSTRUCT(&numa_list, opal_list_t);
|
||||
opal_hwloc_get_sorted_numa_list(node->topology, orte_rmaps_base.device, &numa_list);
|
||||
if (opal_list_get_size(&numa_list) > 0) {
|
||||
numa_item = opal_list_get_first(&numa_list);
|
||||
k = 0;
|
||||
obj = hwloc_get_obj_by_type(node->topology, HWLOC_OBJ_NODE,((orte_rmaps_numa_node_t*)numa_item)->index);
|
||||
npus = opal_hwloc_base_get_npus(node->topology, obj);
|
||||
for (j = 0; j < (int)num_procs_to_assign && nprocs_mapped < (int)app->num_procs; j++) {
|
||||
if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, i))) {
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto error;
|
||||
}
|
||||
nprocs_mapped++;
|
||||
k++;
|
||||
proc->locale = obj;
|
||||
if (k > npus/orte_rmaps_base.cpus_per_rank-1) {
|
||||
numa_item = opal_list_get_next(numa_item);
|
||||
if (numa_item == opal_list_get_end(&numa_list)) {
|
||||
numa_item = opal_list_get_first(&numa_list);
|
||||
}
|
||||
obj = hwloc_get_obj_by_type(node->topology, HWLOC_OBJ_NODE,((orte_rmaps_numa_node_t*)numa_item)->index);
|
||||
npus = opal_hwloc_base_get_npus(node->topology, obj);
|
||||
k = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (NULL != (numa_item = opal_list_remove_first(&numa_list))) {
|
||||
OBJ_RELEASE(numa_item);
|
||||
}
|
||||
OBJ_DESTRUCT(&numa_list);
|
||||
nextra--;
|
||||
if (nextra == 0) {
|
||||
num_procs_to_assign--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* compute vpids and add proc objects to the job - do this after
|
||||
* each app_context so that the ranks within each context are
|
||||
* contiguous
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* track the total number of processes we mapped - must update
|
||||
* this value AFTER we compute vpids so that computation
|
||||
* is done correctly
|
||||
*/
|
||||
jdata->num_procs += app->num_procs;
|
||||
|
||||
/* cleanup the node list - it can differ from one app_context
|
||||
* to another, so we have to get it every time
|
||||
*/
|
||||
while (NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
while(NULL != (item = opal_list_remove_first(&node_list))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&node_list);
|
||||
|
||||
return rc;
|
||||
}
|
@ -111,6 +111,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_job_map_t);
|
||||
#define ORTE_MAPPING_BYL1CACHE 8
|
||||
#define ORTE_MAPPING_BYCORE 9
|
||||
#define ORTE_MAPPING_BYHWTHREAD 10
|
||||
#define ORTE_MAPPING_BYDIST 11
|
||||
/* convenience - declare anything <= 15 to be round-robin*/
|
||||
#define ORTE_MAPPING_RR 0x000f
|
||||
/* sequential policy */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user