/*
 * Source: openmpi/orte/mca/rmaps/lama/rmaps_lama_module.c
 * (1904 lines, 62 KiB, C)
 */

/*
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
*
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/hwloc/hwloc.h"
#include "opal/util/argv.h"
#include "opal/class/opal_tree.h"
#include "orte/util/show_help.h"
#include "orte/util/error_strings.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/rmaps_private.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/runtime/orte_globals.h"
#include "rmaps_lama.h"
#include MCA_timer_IMPLEMENTATION_HEADER
/*********************************
* Module setup
*********************************/
/*
 * Forward declaration of the mapper entry point; the definition appears
 * later in this file.
 */
static int orte_rmaps_lama_map(orte_job_t *jdata);
/*
 * Module instance for the LAMA mapper. It is non-static, and its single
 * initializer is the mapping entry point declared above.
 */
orte_rmaps_base_module_t orte_rmaps_lama_module = {
orte_rmaps_lama_map
};
/*********************************
* Timer
*********************************/
/*
 * Timer slots. Each constant is an index into the timer_start[],
 * timer_end[] and timer_accum[] arrays declared below.
 * RMAPS_LAMA_TIMER_MAX is the number of slots (the array length), not a
 * timer of its own.
 */
#define RMAPS_LAMA_TIMER_TOTAL 0
#define RMAPS_LAMA_TIMER_PARSE_PARAMS 1
#define RMAPS_LAMA_TIMER_BUILD_MAX_TREE 2
#define RMAPS_LAMA_TIMER_MAPPING 3
#define RMAPS_LAMA_TIMER_ORDERING 4
#define RMAPS_LAMA_TIMER_MAX 5

/* Timer helpers; their definitions live elsewhere in this file. */
static double rmaps_lama_get_time(void);
static void rmaps_lama_set_time(int idx, bool is_start);
static void rmaps_lama_display_all_timers(void);
static void rmaps_lama_clear_timers(void);
static void rmaps_lama_display_indv_timer_core(double diff, char *str);

/* Per-slot timer storage, indexed by the RMAPS_LAMA_TIMER_* constants. */
static double timer_start[RMAPS_LAMA_TIMER_MAX];
static double timer_end[RMAPS_LAMA_TIMER_MAX];
static double timer_accum[RMAPS_LAMA_TIMER_MAX];

/*
 * Timing wrappers. Every wrapper does nothing unless
 * rmaps_lama_timing_enabled is set.
 *
 * Each body is wrapped in do { ... } while (0) so that an invocation
 * followed by ';' forms exactly one statement. A bare { ... } block would
 * leave a stray empty statement behind and break constructs such as
 *     if (cond) RMAPS_LAMA_START_TIMER(x); else ...
 */
#define RMAPS_LAMA_CLEAR_TIMERS()                   \
    do {                                            \
        if( rmaps_lama_timing_enabled ) {           \
            rmaps_lama_clear_timers();              \
        }                                           \
    } while (0)
#define RMAPS_LAMA_START_TIMER(idx)                 \
    do {                                            \
        if( rmaps_lama_timing_enabled ) {           \
            rmaps_lama_set_time((idx), true);       \
        }                                           \
    } while (0)
#define RMAPS_LAMA_END_TIMER(idx)                   \
    do {                                            \
        if( rmaps_lama_timing_enabled ) {           \
            rmaps_lama_set_time((idx), false);      \
        }                                           \
    } while (0)
#define RMAPS_LAMA_DISPLAY_TIMERS()                 \
    do {                                            \
        if( rmaps_lama_timing_enabled ) {           \
            rmaps_lama_display_all_timers();        \
        }                                           \
    } while (0)
/*********************************
* Structures & Defines
*********************************/
/*
 * Constructor / destructor for rmaps_lama_hwloc_user_t; both are defined
 * later in this file.
 */
static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item);
static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item);
/*
 * Class instance for rmaps_lama_hwloc_user_t, with opal_object_t as its
 * parent class and the two functions above as constructor and destructor.
 */
OBJ_CLASS_INSTANCE(rmaps_lama_hwloc_user_t,
opal_object_t,
rmaps_lama_hwloc_user_construct,
rmaps_lama_hwloc_user_destruct);
/*********************************
* Globals
*********************************/
/*
 * Mapping
 *
 * Both arrays are filled in by rmaps_lama_parse_mapping() (called from
 * orte_rmaps_lama_process_params()) and freed at the end of
 * orte_rmaps_lama_map(). lama_mapping_num_layouts is the loop bound used
 * whenever either array is walked.
 */
rmaps_lama_level_type_t *lama_mapping_layout = NULL;
static rmaps_lama_level_type_t *lama_mapping_layout_sort = NULL;
int lama_mapping_num_layouts = 0;
/*
 * Binding
 *
 * Set by rmaps_lama_parse_binding(). When the level being iterated equals
 * lama_binding_level, the mapping loop advances by
 * lama_binding_num_levels instead of by one.
 */
rmaps_lama_level_type_t lama_binding_level = LAMA_LEVEL_UNKNOWN;
static int lama_binding_num_levels = 0;
/*
 * MPPR
 *
 * Filled in by rmaps_lama_parse_mppr() and freed at the end of
 * orte_rmaps_lama_map().
 */
rmaps_lama_level_info_t *lama_mppr_levels = NULL;
int lama_mppr_num_levels = 0;
/*
 * Ordering
 *
 * Set by rmaps_lama_parse_ordering(); consulted after mapping to decide
 * whether ranks are renumbered sequentially.
 */
static rmaps_lama_order_type_t lama_ordering = LAMA_ORDER_NATURAL;
/*
 * Homogeneous system optimization
 *
 * Passed by address to rmaps_lama_build_max_tree(), which may update it.
 */
bool lama_mppr_max_tree_homogeneous_system = false;
/*********************************
* Support Macros
*********************************/
/*********************************
* Support functions
*********************************/
/*
 * Preprocess the command line arguments.
 * Parses the binding, mapping, MPPR and ordering strings into the globals
 * above.
 */
static int orte_rmaps_lama_process_params(orte_job_t *jdata);
/*
 * Mapping Support:
 * Core mapping function. Walks every app context of the job, maps its
 * processes, then applies the requested ordering.
 */
static int orte_rmaps_lama_map_core(orte_job_t *jdata);
/*
 * Mapping Support:
 * Recursive function for mapping process.
 * One call handles one level of the mapping layout and recurses with
 * cur_level - 1 until level 0, where a placement is attempted.
 */
static int rmaps_lama_map_core_iter_level(orte_job_t *jdata,
orte_app_context_t *cur_app_context,
opal_list_t *node_list,
orte_node_t **cur_mach_ptr,
opal_tree_t *max_tree,
int cur_level,
int mach_level,
int **pu_idx_ref,
int **last_pu_idx_ref,
int *num_mapped,
int max_procs,
int *iter_passes);
/*
 * Mapping Support:
 * Access the next machine in the node list.
 * Returns NULL once cur_mach is the last item of the list.
 */
static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list,
opal_list_item_t *cur_mach);
/*
 * Mapping Support:
 * Check the availability of the requested slot on the specified node.
 * The slot list is handed back through the slot_list out-parameter.
 */
static int check_node_availability(orte_node_t *cur_node,
opal_tree_t *max_tree,
int *pu_idx_ref,
char **slot_list);
/*
 * Mapping Support:
 * Debugging PU display.
 * pu_ref_to_str() returns a heap string that the caller must free().
 */
static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc);
static char * pu_ref_to_str(int *ref, int size);
/*
 * Mapping Support:
 * Convert the process layout 'layer' to the sorted position for the PU.
 */
static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer);
/*
 * MPPR Support:
 * Check to make sure a process can be placed on this resource given the
 * MPPR restrictions.
 */
static int rmaps_lama_check_mppr(orte_node_t *node,
hwloc_obj_t *child_obj);
static int rmaps_lama_iter_mppr_parents(orte_node_t *node,
hwloc_obj_t *child_obj,
bool check_only);
static int rmaps_lama_iter_mppr_children(orte_node_t *node,
hwloc_obj_t *child_obj,
bool check_only);
/*
 * MPPR Support:
 * Increment parents of this child to account for a process being placed
 * on this resource.
 */
static int rmaps_lama_inc_mppr(orte_node_t *node,
hwloc_obj_t *child_obj);
/*
 * Mapping Support:
 * Return the native representation of the slot list.
 * NOTE(review): the last parameter is spelled 'put_idx_ref' here;
 * presumably 'pu_idx_ref' is meant. Harmless in a prototype.
 */
static char * get_native_slot_list(orte_node_t *cur_node,
hwloc_obj_t *pu_obj,
int *put_idx_ref);
/*
 * Ordering Support:
 * Reorder sequentially (renumber vpids node by node).
 */
static int rmaps_lama_ordering_sequential(orte_job_t *jdata);
/*
 * Map a single process to a specific node.
 * The new process object is returned through 'proc'.
 */
static int orte_rmaps_lama_map_process(orte_job_t *jdata,
orte_node_t *node,
int app_idx,
orte_proc_t **proc);
/*********************************
* Main Module function to map a job
*********************************/
/*
 * Module entry point: map one job with the LAMA mapper.
 *
 * Returns ORTE_ERR_TAKE_NEXT_OPTION when the job explicitly requested a
 * different mapper, the error code of the failing step when parameter
 * parsing or mapping fails, and ORTE_SUCCESS otherwise. The layout and
 * MPPR arrays produced by parameter parsing are always released before
 * returning from the cleanup path.
 */
static int orte_rmaps_lama_map(orte_job_t *jdata)
{
    int rc;
    int exit_status = ORTE_SUCCESS;
    mca_base_component_t *self = &mca_rmaps_lama_component.base_version;

    RMAPS_LAMA_CLEAR_TIMERS();
    RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_TOTAL);

    /* Bow out if a mapper was requested by name and it is not this one */
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, self->mca_component_name)) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: job %s not using lama mapper (using %s)",
                            ORTE_JOBID_PRINT(jdata->jobid),
                            jdata->map->req_mapper);
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* Record this component as the mapper that handled the job */
    free(jdata->map->last_mapper);      /* free(NULL) is a no-op */
    jdata->map->last_mapper = strdup(self->mca_component_name);

    /* The process count is rebuilt from scratch while mapping */
    jdata->num_procs = 0;

    /* Step 1: parse the command line arguments */
    RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS);
    rc = orte_rmaps_lama_process_params(jdata);
    if( ORTE_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        exit_status = rc;
        goto cleanup;
    }
    RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS);

    /* Step 2: perform the mapping itself */
    rc = orte_rmaps_lama_map_core(jdata);
    if( ORTE_SUCCESS != rc ) {
        ORTE_ERROR_LOG(rc);
        exit_status = rc;
        goto cleanup;
    }

    RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_TOTAL);
    RMAPS_LAMA_DISPLAY_TIMERS();

 cleanup:
    /* Drop everything the parameter parsing allocated */
    free(lama_mapping_layout);
    lama_mapping_layout = NULL;

    free(lama_mapping_layout_sort);
    lama_mapping_layout_sort = NULL;

    free(lama_mppr_levels);
    lama_mppr_levels = NULL;

    return exit_status;
}
/*********************************
* User defined lookup structure for hwloc topology
*********************************/
/*
 * Constructor: give the item a fresh pointer array in node_mppr and
 * initialize it with the global ORTE array block/max sizes.
 */
static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item)
{
    opal_pointer_array_t *mppr = OBJ_NEW(opal_pointer_array_t);

    item->node_mppr = mppr;
    opal_pointer_array_init(mppr,
                            ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                            ORTE_GLOBAL_ARRAY_MAX_SIZE,
                            ORTE_GLOBAL_ARRAY_BLOCK_SIZE);
}
/*
 * Destructor: release every non-NULL entry held in node_mppr, then the
 * array itself. Does nothing when node_mppr is already NULL.
 */
static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item)
{
    orte_std_cntr_t slot;

    if( NULL == item->node_mppr ) {
        return;
    }

    for( slot = 0; slot < item->node_mppr->size; ++slot ) {
        if( NULL == item->node_mppr->addr[slot] ) {
            continue;
        }
        OBJ_RELEASE(item->node_mppr->addr[slot]);
        item->node_mppr->addr[slot] = NULL;
    }

    OBJ_RELEASE(item->node_mppr);
    item->node_mppr = NULL;
}
/*********************************
* Command line parameter parsing functions
*********************************/
/*
 * Parse the LAMA command line parameters into the file-level globals.
 *
 * Order of work: aliases, binding, mapping layout, MPPR, ordering.
 * The first parser that fails has its message printed and its return
 * code handed back to the caller; ORTE_SUCCESS is returned when all of
 * them succeed. The job's binding policy is reset to OPAL_BIND_TO_NONE
 * once the binding string has been parsed. The parsed values are echoed
 * only when the rmaps output verbosity is at least 10.
 */
static int orte_rmaps_lama_process_params(orte_job_t *jdata)
{
    int rc;
    int idx;
    char *label = NULL;
    const char *order_name = NULL;

    /* Aliases for map/bind/order/mppr */
    rc = rmaps_lama_process_alias_params(jdata);
    if( ORTE_SUCCESS != rc ) {
        opal_output(0, "mca:rmaps:lama: ERROR: Failed while processing aliases");
        return rc;
    }

    /* ---------------- Binding ---------------- */
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ----- Binding : [%s]",
                        rmaps_lama_cmd_bind);

    rc = rmaps_lama_parse_binding(rmaps_lama_cmd_bind,
                                  &lama_binding_level,
                                  &lama_binding_num_levels);
    if( ORTE_SUCCESS != rc ) {
        opal_output(0, "mca:rmaps:lama: ERROR: Invalid Binding String: %s",
                    rmaps_lama_cmd_bind);
        return rc;
    }

    if( opal_output_get_verbosity(orte_rmaps_base.rmaps_output) >= 10 ) {
        label = lama_type_enum_to_str(lama_binding_level);
        opal_output_verbose(10, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: ----- Binding : %*d x %10s",
                            MAX_BIND_DIGIT_LEN, lama_binding_num_levels, label);
        free(label);
        label = NULL;
    }

    /* This component binds on its own, so clear the generic policy */
    OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);

    /* ---------------- Mapping (process layout) ---------------- */
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ----- Mapping : [%s]",
                        rmaps_lama_cmd_map);

    rc = rmaps_lama_parse_mapping(rmaps_lama_cmd_map,
                                  &lama_mapping_layout,
                                  &lama_mapping_layout_sort,
                                  &lama_mapping_num_layouts);
    if( ORTE_SUCCESS != rc ) {
        opal_output(0, "mca:rmaps:lama: ERROR: Invalid Mapping Process Layout: %s",
                    rmaps_lama_cmd_map);
        return rc;
    }

    if( opal_output_get_verbosity(orte_rmaps_base.rmaps_output) >= 10 ) {
        idx = 0;
        while( idx < lama_mapping_num_layouts ) {
            label = lama_type_enum_to_str(lama_mapping_layout[idx]);
            opal_output_verbose(10, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:lama: ----- Mapping : (%d) %10s (%d vs %d)",
                                idx, label,
                                lama_mapping_layout[idx], lama_mapping_layout_sort[idx]);
            free(label);
            label = NULL;
            ++idx;
        }
    }

    /* ---------------- MPPR ---------------- */
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ----- MPPR : [%s]",
                        rmaps_lama_cmd_mppr);

    rc = rmaps_lama_parse_mppr(rmaps_lama_cmd_mppr,
                               &lama_mppr_levels,
                               &lama_mppr_num_levels);
    if( ORTE_SUCCESS != rc ) {
        opal_output(0, "mca:rmaps:lama: ERROR: Invalid MPPR: %s",
                    rmaps_lama_cmd_mppr);
        return rc;
    }

    if( opal_output_get_verbosity(orte_rmaps_base.rmaps_output) >= 10 ) {
        idx = 0;
        while( idx < lama_mppr_num_levels ) {
            label = lama_type_enum_to_str(lama_mppr_levels[idx].type);
            opal_output_verbose(10, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:lama: ----- MPPR : %*d at %10s",
                                MAX_BIND_DIGIT_LEN, lama_mppr_levels[idx].max_resources, label);
            free(label);
            label = NULL;
            ++idx;
        }
    }

    /* ---------------- Ordering ---------------- */
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ----- Ordering : [%s]",
                        rmaps_lama_cmd_ordering);

    rc = rmaps_lama_parse_ordering(rmaps_lama_cmd_ordering,
                                   &lama_ordering);
    if( ORTE_SUCCESS != rc ) {
        opal_output(0, "mca:rmaps:lama: ERROR: Invalid Ordering Argument: %s",
                    rmaps_lama_cmd_ordering);
        return rc;
    }

    if( opal_output_get_verbosity(orte_rmaps_base.rmaps_output) >= 10 ) {
        if( LAMA_ORDER_NATURAL == lama_ordering ) {
            order_name = "Natural";
        } else if( LAMA_ORDER_SEQ == lama_ordering ) {
            order_name = "Sequential";
        } else {
            order_name = "Unknown";
        }
        label = strdup(order_name);
        opal_output_verbose(10, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: ----- Ordering : %10s",
                            label);
        free(label);
        label = NULL;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");

    return ORTE_SUCCESS;
}
/*********************************
* Support functions
*********************************/
/*
 * Translate a level token into its rmaps_lama_level_type_t value.
 *
 * Matching is case sensitive and prefix based: only the leading
 * character(s) of 'param' are compared, so trailing text is ignored.
 * Returns LAMA_LEVEL_UNKNOWN when no known prefix matches.
 * 'param' must not be NULL.
 */
rmaps_lama_level_type_t lama_type_str_to_enum(char *param)
{
    static const struct {
        const char *prefix;
        rmaps_lama_level_type_t level;
    } known_levels[] = {
        { "n",  LAMA_LEVEL_MACHINE  },
        { "b",  LAMA_LEVEL_BOARD    },
        { "s",  LAMA_LEVEL_SOCKET   },
        { "c",  LAMA_LEVEL_CORE     },
        { "h",  LAMA_LEVEL_PU       },
        { "L1", LAMA_LEVEL_CACHE_L1 },
        { "L2", LAMA_LEVEL_CACHE_L2 },
        { "L3", LAMA_LEVEL_CACHE_L3 },
        { "N",  LAMA_LEVEL_NUMA     }
    };
    size_t k;

    for( k = 0; k < sizeof(known_levels) / sizeof(known_levels[0]); ++k ) {
        const char *prefix = known_levels[k].prefix;

        if( 0 == strncmp(param, prefix, strlen(prefix)) ) {
            return known_levels[k].level;
        }
    }

    return LAMA_LEVEL_UNKNOWN;
}
/*
 * Return a human readable name for a level type.
 *
 * The result is a freshly strdup()'ed string that the caller must
 * free(). Values without a dedicated name yield "Unknown".
 */
char * lama_type_enum_to_str(rmaps_lama_level_type_t param)
{
    const char *name;

    switch( param ) {
    case LAMA_LEVEL_MACHINE:
        name = "Machine";
        break;
    case LAMA_LEVEL_BOARD:
        name = "Board";
        break;
    case LAMA_LEVEL_SOCKET:
        name = "Socket";
        break;
    case LAMA_LEVEL_CORE:
        name = "Core";
        break;
    case LAMA_LEVEL_PU:
        name = "Hw. Thread";
        break;
    case LAMA_LEVEL_CACHE_L1:
        name = "L1 Cache";
        break;
    case LAMA_LEVEL_CACHE_L2:
        name = "L2 Cache";
        break;
    case LAMA_LEVEL_CACHE_L3:
        name = "L3 Cache";
        break;
    case LAMA_LEVEL_NUMA:
        name = "NUMA";
        break;
    default:
        name = "Unknown";
        break;
    }

    return strdup(name);
}
/*********************************
* Core Mapper function
*********************************/
/*
 * Core mapper: place every process of every app context in the job.
 *
 * For each app context this function
 *   1. fetches the target node list and the starting node,
 *   2. builds the max tree for those nodes,
 *   3. repeatedly calls rmaps_lama_map_core_iter_level() until the
 *      requested number of processes is mapped, no progress is made, or
 *      oversubscription would be required but is not allowed.
 * After all app contexts are mapped, the ranks are reordered
 * sequentially when LAMA_ORDER_SEQ was requested.
 *
 * Returns ORTE_SUCCESS, the error code of the failing helper,
 * ORTE_ERR_OUT_OF_RESOURCE when the PU index arrays cannot be allocated,
 * or ORTE_ERROR when mapping cannot complete.
 *
 * All locally owned resources (node list, max tree, both PU index
 * arrays, debug string) are released on every exit path.
 */
static int orte_rmaps_lama_map_core(orte_job_t *jdata)
{
    int ret, exit_status = ORTE_SUCCESS;
    int cur_app_idx = 0;
    int num_slots;
    orte_app_context_t *cur_app_context = NULL;
    orte_node_t *cur_mach = NULL;
    orte_proc_t *proc = NULL;
    opal_list_t *node_list = NULL;
    opal_list_item_t *item = NULL;
    opal_tree_t *max_tree = NULL;
    int *pu_idx_ref = NULL;
    int *last_pu_idx_ref = NULL;
    int i, num_mapped, last_num_mapped, mach_level = -1;
    orte_std_cntr_t j;
    int max_procs_to_map;
    int iter_passes;
    char * last_level_str = NULL;
    bool initial_map = true;

    /*
     * Setup PU reference
     * Find the position of the 'machine'
     */
    pu_idx_ref      = (int*)malloc(sizeof(int) * lama_mapping_num_layouts);
    last_pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts);
    if( NULL == pu_idx_ref || NULL == last_pu_idx_ref ) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
        goto cleanup;
    }
    for( i = 0; i < lama_mapping_num_layouts; ++i ) {
        pu_idx_ref[i] = 0;
        /* -1 marks "no bookmark yet" for the recursive iterator */
        last_pu_idx_ref[i] = -1;
        if( LAMA_LEVEL_MACHINE == lama_mapping_layout[i] ) {
            mach_level = i;
        }
    }

    /*
     * Foreach app context
     */
    for(cur_app_idx = 0; cur_app_idx < jdata->apps->size; ++cur_app_idx ) {
        if( NULL == (cur_app_context = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, cur_app_idx))) {
            continue;
        }

        /*
         * Get the list of nodes for this app_context.
         */
        node_list = OBJ_NEW(opal_list_t);
        ret = orte_rmaps_base_get_target_nodes(node_list,
                                               &num_slots,
                                               cur_app_context,
                                               jdata->map->mapping,
                                               initial_map, false);
        if(ORTE_SUCCESS != ret ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /* Flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /*
         * If a bookmark exists from some prior mapping, then start from there
         */
        cur_mach = (orte_node_t*)orte_rmaps_base_get_starting_point(node_list, jdata);

        /*
         * If the application did not specify the number of procs
         * then set it to the number of 'slots'
         * JJH: TODO: Revisit 'max_procs' calculation
         */
        if (0 == cur_app_context->num_procs) {
            cur_app_context->num_procs = num_slots;
        }
        max_procs_to_map = cur_app_context->num_procs;

        /*
         * Build the Max Tree
         */
        RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE);
        max_tree = rmaps_lama_create_empty_max_tree();
        if( ORTE_SUCCESS != (ret = rmaps_lama_build_max_tree(jdata, node_list,
                                                             max_tree,
                                                             &lama_mppr_max_tree_homogeneous_system)) ) {
            exit_status = ret;
            goto cleanup;
        }
        RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE);

        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: Mapping: -----------------------");
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: ---------------------------------");

        /*
         * NOTE(review): this timer is started once per app context but
         * only ended once after the loop - confirm that is intended.
         */
        RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_MAPPING);

        /*
         * Clear PU reference
         */
        for( i = 0; i < lama_mapping_num_layouts; ++i ) {
            pu_idx_ref[i] = 0;
        }

        /*
         * Mapping: Recursively loop over all levels
         *
         * The iterator advances the current machine through a pointer.
         * Hand it the address of the local 'cur_mach' rather than a
         * heap cell, so nothing has to be freed per app context.
         */
        num_mapped = 0;
        last_num_mapped = 0;
        iter_passes = 0;
        while( max_procs_to_map > num_mapped ) {
            ret = rmaps_lama_map_core_iter_level(jdata,
                                                 cur_app_context,
                                                 node_list,
                                                 &cur_mach,
                                                 max_tree,
                                                 lama_mapping_num_layouts-1,
                                                 mach_level,
                                                 &pu_idx_ref,
                                                 &last_pu_idx_ref,
                                                 &num_mapped,
                                                 max_procs_to_map,
                                                 &iter_passes);
            if( ORTE_SUCCESS != ret ) {
                ORTE_ERROR_LOG(ret);
                exit_status = ret;
                goto cleanup;
            }

            /*
             * We only get here (without finishing the mapping) if we are going to
             * start oversubscribing resources.
             */
            if( max_procs_to_map > num_mapped ) {
                if( !rmaps_lama_can_oversubscribe ) {
                    orte_show_help("help-orte-rmaps-lama.txt",
                                   "orte-rmaps-lama:oversubscribe",
                                   true,
                                   num_mapped, max_procs_to_map);
                    exit_status = ORTE_ERROR;
                    goto cleanup;
                } else {
                    rmaps_lama_am_oversubscribing = true;
                }
            }

            /*
             * Check to see if we have made any progress in the mapping loop
             */
            if( 0 < cur_app_idx && 2 == iter_passes ) {
                /*
                 * Give it another pass:
                 * This is an edge case when we are trying to restart from a
                 * bookmark left by a previous app context. If this app context
                 * is starting from exactly the beginning of the allocation
                 * then the recursive loop could return out here after the
                 * increment pass. This is indicated by (iter_passes = 2).
                 * Since no processes were mapped, we just try again.
                 */
            }
            else if( last_num_mapped == num_mapped ) {
                orte_show_help("help-orte-rmaps-lama.txt",
                               "orte-rmaps-lama:no-resources-available",
                               true,
                               cur_app_idx,
                               num_mapped, max_procs_to_map,
                               (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map),
                               (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind),
                               (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr),
                               (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering));
                exit_status = ORTE_ERROR;
                goto cleanup;
            } else {
                last_num_mapped = num_mapped;
            }
        }

        /*
         * Display Bookmark for debugging
         * The bookmark is only set once a process has been mapped, so
         * guard against it still being NULL.
         */
        last_level_str = pu_ref_to_str(last_pu_idx_ref, lama_mapping_num_layouts);
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: Bookmark: --> Node %10s PU %10s",
                            (NULL == jdata->bookmark ? "(NULL)" : jdata->bookmark->name),
                            last_level_str);
        free(last_level_str);
        last_level_str = NULL;

        /*
         * Cleanup for next iteration
         */
        if( NULL != node_list ) {
            while(NULL != (item = opal_list_remove_first(node_list))) {
                OBJ_RELEASE(item);
            }
            OBJ_RELEASE(node_list);
            node_list = NULL;
        }
        OBJ_RELEASE(max_tree);
        max_tree = NULL;
    }
    RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_MAPPING);

    /*
     * Ordering
     */
    RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_ORDERING);
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");
    if( LAMA_ORDER_SEQ == lama_ordering ) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: Ordering: Sequential ------------");
        if( ORTE_SUCCESS != (ret = rmaps_lama_ordering_sequential(jdata)) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
    }
    else {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: Ordering: Natural ---------------");
#if 0
        /*
         * We compute our own vpids inline with the algorithm. So no need to use the
         * orte_rmaps_base_compute_vpids() function.
         */
#endif
    }
    RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_ORDERING);

    /*
     * Display Mapping
     */
    if( 10 <= opal_output_get_verbosity(orte_rmaps_base.rmaps_output) ) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: ---------------------------------");
        for( j = 0; j < jdata->procs->size; ++j) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
                continue;
            }
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:lama: Ordering: Proc. %2d on Node %10s - Slot %s",
                                proc->name.vpid, proc->node->name, proc->cpu_bitmap);
        }
    }

    /*
     * All done
     */
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Finished ------------------------");
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");

 cleanup:
    if( NULL != node_list ) {
        while(NULL != (item = opal_list_remove_first(node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_RELEASE(node_list);
        node_list = NULL;
    }
    if( NULL != max_tree ) {
        OBJ_RELEASE(max_tree);
        max_tree = NULL;
    }

    /* free(NULL) is a no-op, so no guards are needed below */
    free(pu_idx_ref);
    pu_idx_ref = NULL;

    free(last_pu_idx_ref);
    last_pu_idx_ref = NULL;

    free(last_level_str);
    last_level_str = NULL;

    return exit_status;
}
/*
 * Recursive worker for the mapping loop.
 *
 * One invocation iterates over one level of the mapping layout
 * (cur_level) and recurses with cur_level - 1 until level 0 is reached,
 * where a placement is attempted for the PU index currently described by
 * *pu_idx_ref.
 *
 *  jdata            job being mapped
 *  cur_app_context  app context whose processes are being placed
 *  node_list        nodes available to this app context
 *  cur_mach_ptr     in/out: current node; advanced at the machine level
 *  max_tree         max tree used to size non-machine levels
 *  cur_level        index into lama_mapping_layout handled by this call
 *  mach_level       index of the machine level in lama_mapping_layout
 *  pu_idx_ref       in/out: PU index array being iterated
 *  last_pu_idx_ref  in/out: bookmark of the last successful placement
 *  num_mapped       in/out: number of processes mapped so far
 *  max_procs        stop as soon as *num_mapped reaches this value
 *  iter_passes      in/out: pass counter; while it is 0 and a bookmark
 *                   exists, the innermost level only advances the index
 *                   without mapping
 *
 * Returns ORTE_SUCCESS, or the error code of the failing step. Every
 * exit goes through the cleanup label so that the level name and the
 * slot list string are always released.
 */
static int rmaps_lama_map_core_iter_level(orte_job_t *jdata,
                                          orte_app_context_t *cur_app_context,
                                          opal_list_t *node_list,
                                          orte_node_t **cur_mach_ptr,
                                          opal_tree_t *max_tree,
                                          int cur_level,
                                          int mach_level,
                                          int **pu_idx_ref,
                                          int **last_pu_idx_ref,
                                          int *num_mapped,
                                          int max_procs,
                                          int *iter_passes)
{
    int ret, exit_status = ORTE_SUCCESS;
    int i, j;
    opal_tree_item_t *tree_for_level = NULL;
    int max_subtree_arity = 0;
    char * level_str = NULL;
    char * last_level_str = NULL;
    char * slot_list = NULL;
    orte_proc_t *proc = NULL;
    int pu_idx = 0;

    /*
     * Find the current tree for this level
     * If it is the machine level, then we need to access the information from
     * the node list, not the max_tree.
     */
    if( cur_level != mach_level ) {
        tree_for_level = opal_tree_find_with(opal_tree_get_root(max_tree),
                                             &lama_mapping_layout[cur_level]);
        /*
         * We do not need subtree, but the arity of the subtree
         * JJH TODO: This should be an opal_tree function.
         */
        max_subtree_arity = 1; /* include self */
        while( NULL != (tree_for_level = opal_tree_get_next_sibling(tree_for_level)) ) {
            ++max_subtree_arity;
        }
    }
    else if( NULL == *cur_mach_ptr ) {
        *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr));
    }

    pu_idx = convert_layer_to_sort_idx(lama_mapping_layout[cur_level]);
    level_str = lama_type_enum_to_str(lama_mapping_layout[cur_level]);

    /*
     * Do we need to advance to a bookmark
     */
    if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) {
        /*
         * Display last mapped
         */
        last_level_str = pu_ref_to_str(*last_pu_idx_ref, lama_mapping_num_layouts);
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: Bookmark: --> Last Mapped: Node %10s (bkmrk %10s) PU %10s - Level %2d",
                            (NULL == *cur_mach_ptr ? "(NULL)" : (*cur_mach_ptr)->name),
                            jdata->bookmark->name, last_level_str, (*last_pu_idx_ref)[pu_idx]);
        free(last_level_str);
        last_level_str = NULL;

        /*
         * Set the level starting point to the last known index
         */
        i = (*last_pu_idx_ref)[pu_idx];
    } else {
        i = 0;
    }

    /*
     * Loop over all siblings at this level
     * Initial condition above, Increment at bottom, Break check at bottom
     */
    while( 1 ) {
        /*
         * Define the PU index
         */
        (*pu_idx_ref)[pu_idx] = i;

        if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s - Increment only",
                                cur_level+1,
                                level_str, pu_idx, i, max_subtree_arity,
                                (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name));
        } else {
            opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s",
                                cur_level+1,
                                level_str, pu_idx, i, max_subtree_arity,
                                (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name));
        }

        /*
         * If not the inner most loop, iterate to the next level down
         */
        if( cur_level > 0 ) {
            ret = rmaps_lama_map_core_iter_level(jdata,
                                                 cur_app_context,
                                                 node_list,
                                                 cur_mach_ptr,
                                                 max_tree,
                                                 cur_level - 1,
                                                 mach_level,
                                                 pu_idx_ref,
                                                 last_pu_idx_ref,
                                                 num_mapped,
                                                 max_procs,
                                                 iter_passes);
            if( ORTE_SUCCESS != ret ) {
                ORTE_ERROR_LOG(ret);
                exit_status = ret;
                goto cleanup;
            }
        }
        /*
         * If we are restarting the iteration from a previous bookmark then
         * the first pass through is a no-op mapping pass that just increments
         * the PU reference.
         * Called by innermost loop
         */
        else if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) {
            *iter_passes += 1;
        }
        /*
         * Try to map at this location
         */
        else {
            /*
             * On first pass, make sure we increment this, just so we do not
             * accidentally think this is an increment pass.
             */
            if( 0 == *iter_passes ) {
                *iter_passes += 1;
            }

            /*
             * Display the PU ref for debugging
             */
            display_pu_ref(*pu_idx_ref, lama_mapping_num_layouts, *num_mapped, proc);

            /*
             * Drop the slot list left over from the previous iteration
             * before asking for a new one; otherwise each pass through
             * this loop would leak the previous string. The cleanup path
             * free()s slot_list, so the string is owned by this function.
             * NOTE(review): assumes check_node_availability() does not
             * rely on the previous contents of *slot_list - confirm.
             */
            free(slot_list);
            slot_list = NULL;

            /*
             * Check to see if this resource is available on this node.
             *
             * In a heterogeneous or otherwise non-uniformly restricted
             * environment we may iterate to a resource that is not
             * available either because it does not exist, or is not
             * available for allocation (off-lined, sub-node allocation).
             * Additionally, we need to check resource constrains expressed
             * in the MPPR and binding.
             */
            ret = check_node_availability((*cur_mach_ptr),
                                          max_tree,
                                          *pu_idx_ref,
                                          &slot_list);
            if( ORTE_SUCCESS != ret || NULL == slot_list ) {
                /*
                 * NOTE(review): this message uses the prefix
                 * "mca:rmaps:hwtopo" while every other message in this
                 * file uses "mca:rmaps:lama".
                 */
                opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                    "mca:rmaps:hwtopo: Mapping: --> Level %2d: %s - INVALID/SKIP",
                                    cur_level+1,
                                    level_str);
                /*
                 * By not mapping here we just let the iterations continue
                 * until a suitable match is found or we have exhausted all
                 * possible locations to match and thus cannot map any more.
                 */
            }
            else {
                opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                                    "mca:rmaps:lama: Mapping: --> Level %2d: %s - Slot List (%s)",
                                    cur_level+1,
                                    level_str, slot_list);

                /*
                 * Map this process onto the resource specified
                 * level_tree_objs[*] and cur_mach point to the specific resource
                 */
                proc = NULL;
                ret = orte_rmaps_lama_map_process(jdata,
                                                  (*cur_mach_ptr),
                                                  cur_app_context->idx,
                                                  &proc);
                if( ORTE_SUCCESS != ret ) {
                    ORTE_ERROR_LOG(ret);
                    /* leave through cleanup so the strings are released */
                    exit_status = ret;
                    goto cleanup;
                }

                /*
                 * Set the binding for this process
                 */
                proc->cpu_bitmap = strdup(slot_list);

                /** JJH: Need to associate with an HWLOC object... hummm.... */
                proc->locale = NULL;
                /* proc->locale = obj; */

                /*
                 * Insert the proc into the 'native' ordering location.
                 */
                proc->name.vpid = jdata->num_procs;
                if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(jdata->procs,
                                                                       proc->name.vpid, proc))) {
                    ORTE_ERROR_LOG(ret);
                    exit_status = ret;
                    goto cleanup;
                }
                jdata->num_procs += 1;

                /*
                 * Save a bookmark so we can return here later if necessary
                 */
                for( j = 0; j < lama_mapping_num_layouts; ++j ) {
                    (*last_pu_idx_ref)[j] = (*pu_idx_ref)[j];
                }
                jdata->bookmark = (orte_node_t*)(*cur_mach_ptr);

                (*num_mapped)++;
            }
        }

        /*
         * Increment loop
         *
         * If we are binding, then we may need to advance the binding layer
         * by more than one.
         */
        if( cur_level != mach_level ) {
            if( lama_binding_level == lama_mapping_layout[cur_level] ) {
                i += lama_binding_num_levels;
            } else {
                ++i;
            }
        } else {
            /*
             * Note: Currently we do not allow for 'binding' to multiple machines
             * But keep the code just in case we want to play with 'stride' later
             */
            if( lama_binding_level == lama_mapping_layout[cur_level] && lama_binding_num_levels > 1) {
                opal_output(0, "mca:rmaps:lama: ERROR: Cannot bind to multiple machines - SHOULD NEVER HAPPEN: %s",
                            rmaps_lama_cmd_bind);
                /* leave through cleanup so the strings are released */
                exit_status = ORTE_ERROR;
                goto cleanup;
#if 0
                for( j = 0; j < lama_binding_num_levels; ++j ) {
                    cur_mach = get_next_machine(jdata, node_list, (opal_list_item_t*)cur_mach);
                    if( NULL == cur_mach ) {
                        break;
                    }
                    ++i;
                }
#endif
            } else {
                *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr));
                ++i;
            }
        }

        /*
         * Check if we are done mapping before iterating again
         */
        if( max_procs <= *num_mapped ) {
            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }

        /*
         * Check if we are done looping
         */
        if( cur_level != mach_level ) {
            if( i >= max_subtree_arity ) {
                break;
            }
        } else {
            if( NULL == *cur_mach_ptr ) {
                break;
            }
        }
    }

 cleanup:
    /*
     * If the outermost layer, the increment the number of iteration passes.
     */
    if( cur_level == lama_mapping_num_layouts-1 ) {
        *iter_passes += 1;
    }

    /* free(NULL) is a no-op, so no guards are needed */
    free(level_str);
    level_str = NULL;

    free(slot_list);
    slot_list = NULL;

    return exit_status;
}
/*
 * Step to the next node of node_list.
 *
 *  cur_mach == NULL         -> the first item of the list
 *  cur_mach is the last one -> NULL
 *  otherwise                -> the item following cur_mach
 *
 * 'jdata' is not used.
 */
static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list,
                                     opal_list_item_t *cur_mach)
{
    if( NULL == cur_mach ) {
        return (orte_node_t*)opal_list_get_first(node_list);
    }

    if( cur_mach == opal_list_get_last(node_list) ) {
        return NULL;
    }

    return (orte_node_t*)opal_list_get_next(cur_mach);
}
/*
 * Create the process object for one placement on 'node'.
 *
 * The node is appended to the job map the first time it is used (its
 * 'mapped' flag is set, it is retained, and the map's node count grows).
 * The new process is returned through 'proc'.
 *
 * Returns ORTE_SUCCESS, the negative result of the pointer array add, or
 * ORTE_ERR_OUT_OF_RESOURCE when the process object cannot be set up.
 */
static int orte_rmaps_lama_map_process(orte_job_t *jdata,
                                       orte_node_t *node,
                                       int app_idx,
                                       orte_proc_t **proc)
{
    int rc;

    /* First use of this node by the job: register it in the map */
    if( !node->mapped ) {
        rc = opal_pointer_array_add(jdata->map->nodes, (void*)node);
        if( rc < ORTE_SUCCESS ) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        node->mapped = true;
        OBJ_RETAIN(node); /* maintain accounting on object */
        jdata->map->num_nodes += 1;
    }

    /* Build the process object */
    *proc = orte_rmaps_base_setup_proc(jdata, node, app_idx);
    if( NULL == *proc ) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    return ORTE_SUCCESS;
}
/*
 * Sequential ordering: renumber the job's processes node by node.
 *
 * Pass 1 walks the nodes of the job map in array order and, for every
 * process on a node that belongs to this job, assigns the next vpid
 * (starting at 0).
 * Pass 2 permutes jdata->procs in place so that each process ends up at
 * the array index equal to its new vpid.
 *
 * Always returns ORTE_SUCCESS.
 */
static int rmaps_lama_ordering_sequential(orte_job_t *jdata)
{
orte_job_map_t *map;
orte_proc_t *proc = NULL, *swap = NULL;
orte_std_cntr_t i, j;
int cur_rank = 0;
orte_node_t *cur_node = NULL;
map = jdata->map;
opal_output_verbose(15, orte_rmaps_base.rmaps_output,
"mca:rmaps:lama: ---------------------------------");
/*
* Assign the ranks sequentially
*/
for( i = 0; i < map->nodes->size; ++i) {
/* empty slots in the node array are skipped */
if (NULL == (cur_node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
continue;
}
for( j = 0; j < cur_node->procs->size; ++j) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(cur_node->procs, j))) {
continue;
}
/* ignore procs from other jobs */
if (proc->name.jobid != jdata->jobid) {
continue;
}
opal_output_verbose(15, orte_rmaps_base.rmaps_output,
"mca:rmaps:lama: Ordering: Rename Proc. %2d to %2d (Rev. %s)",
proc->name.vpid, cur_rank, proc->node->name);
proc->name.vpid = cur_rank;
++cur_rank;
}
}
/*
* Fix the job structure ordering - Sort by new vpid
*
* If we do not do this then the remote daemons assign the incorrect
* ranks to the processes since they use the relative ordering in the
* jdata->procs structure to determine vpids locally.
*
* JJH: Look at combining these loops with the loop in the core so we
* JJH: do not have to iterate over the list two times
*/
opal_output_verbose(5, orte_rmaps_base.rmaps_output,
"mca:rmaps:lama: ---------------------------------");
/* cur_rank now counts the non-NULL entries of jdata->procs seen so far */
cur_rank = 0;
for( j = 0; j < jdata->procs->size; ++j) {
if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
continue;
}
opal_output_verbose(15, orte_rmaps_base.rmaps_output,
"mca:rmaps:lama: Ordering: Proc. %2d on Node %s",
proc->name.vpid, proc->node->name);
/*
* Each pass stores 'proc' at the index equal to its vpid and moves the
* entry that was there into slot cur_rank; that displaced entry becomes
* the next 'proc'. The loop ends once the entry in hand has
* vpid == cur_rank.
* NOTE(review): 'swap' is dereferenced without a NULL check, so this
* presumably relies on every vpid index being occupied - confirm.
*/
while((int)proc->name.vpid != cur_rank ) {
swap = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid);
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
opal_pointer_array_set_item(jdata->procs, cur_rank, swap);
opal_output_verbose(15, orte_rmaps_base.rmaps_output,
"mca:rmaps:lama: Ordering: \t SWAP Proc. %2d (%d) and Proc. %2d (%d)",
proc->name.vpid, cur_rank, swap->name.vpid, proc->name.vpid);
proc = swap;
}
++cur_rank;
}
return ORTE_SUCCESS;
}
/*
 * Translate a LAMA level into its position within the mapping layout
 * sort order (lama_mapping_layout_sort).
 *
 * Returns the index of the first entry equal to 'layer'.  If no entry
 * matches, 0 is returned, so a caller cannot distinguish "not found"
 * from "found at index 0".
 */
static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer)
{
    int pos = 0;

    while( pos < lama_mapping_num_layouts ) {
        if( layer == lama_mapping_layout_sort[pos] ) {
            return pos;
        }
        ++pos;
    }

    /* No match - fall back to the first position */
    return 0;
}
/*
 * Emit (at verbosity 5) a one line description of a PU reference.
 *
 * ref  : array of per-level indices making up the PU reference
 * size : number of entries in 'ref'
 * rank : rank printed alongside the reference
 * proc : process whose name is printed; may be NULL, in which case
 *        the literal "(null)" is printed instead
 */
static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc)
{
    /* Temporary rendering of the reference; released before returning */
    char *ref_text = pu_ref_to_str(ref, size);
    const char *proc_label;

    if( NULL == proc ) {
        proc_label = "(null)";
    } else {
        proc_label = ORTE_NAME_PRINT(&proc->name);
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Mapping: PU Ref: %s [Rank %2d] Name: %s",
                        ref_text, rank, proc_label);

    free(ref_text);
}
/*
 * Render a PU reference as a string of fixed width fields.
 *
 * Each of the 'size' entries of 'ref' is printed with "%2d", giving a
 * string of exactly 2 * size characters.  A value that needs more than
 * two characters is cut to its first two characters so that every field
 * keeps its column.
 *
 * ref  : array of indices to print (may be NULL only if size is 0)
 * size : number of entries in 'ref'; values <= 0 produce an empty string
 *
 * Returns a newly allocated, NUL terminated string that the caller must
 * free(), or NULL if the allocation fails.
 */
static char * pu_ref_to_str(int *ref, int size)
{
    char *str = NULL;
    size_t capacity;
    size_t offset;
    int i;

    if( NULL == ref || size < 0 ) {
        size = 0;
    }

    /* Two characters per entry plus the terminating NUL */
    capacity = ((size_t)size * 2) + 1;
    str = (char *)malloc(sizeof(char) * capacity);
    if( NULL == str ) {
        return NULL;
    }
    str[0] = '\0';

    for(i = 0; i < size; ++i) {
        offset = (size_t)i * 2;
        /*
         * Bound the write by the space that is left so the final field
         * (and any overly wide value) can never run past the buffer.
         */
        snprintf(&(str[offset]), capacity - offset, "%2d", ref[i]);
    }

    return str;
}
/*
 * Determine whether 'cur_node' has the hardware described by the PU
 * index reference and, if so, obtain the slot list to bind to.
 *
 * cur_node   : node whose hwloc topology is inspected
 * max_tree   : not referenced by this function
 * pu_idx_ref : one index per mapping layout level, in
 *              lama_mapping_layout_sort order
 * slot_list  : (OUT) receives the string returned by
 *              get_native_slot_list(); may be set to NULL
 *
 * Returns ORTE_ERROR if a level of the reference cannot be found on the
 * node, ORTE_SUCCESS otherwise.
 *
 * NOTE(review): when get_native_slot_list() returns NULL this function
 * still returns ORTE_SUCCESS with *slot_list == NULL - confirm that the
 * callers test the slot list as well as the return code.
 *
 * NOTE(review): on the error path *slot_list is freed if it is non-NULL,
 * even though this function has not assigned it yet at that point; this
 * presumably relies on the caller initialising it to NULL - confirm.
 */
static int check_node_availability(orte_node_t *cur_node,
                                   opal_tree_t *max_tree,
                                   int *pu_idx_ref,
                                   char **slot_list)
{
    int exit_status = ORTE_SUCCESS;
    int i;
    char * level_str = NULL;
    /*
     * Root of the node topology.  Kept on the stack: the previous heap
     * cell was overwritten while descending and therefore leaked on
     * every call.  The address is only handed to calls made from this
     * function; assumes none of them keep it past their return.
     */
    hwloc_obj_t root_obj;
    hwloc_obj_t *topo_child = NULL, *topo_parent = NULL;

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Checking: Node (%s) -------------",
                        cur_node->name);
    opal_output_verbose(11, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: ---------------------------------");

    /*
     * Determine if the current node has the necessary hardware
     * as described by the PU index.
     * Find the hwloc object reference for the resource pointed to
     * by the PU index.
     * JJH TODO: If homogeneous system then this could be simplified.
     */
    root_obj = hwloc_get_obj_by_depth(cur_node->topology, 0, 0);
    topo_parent = &root_obj;

    for( i = 0; i < lama_mapping_num_layouts; ++i ) {
        /*
         * Skip 'machine' level
         */
        if( LAMA_LEVEL_MACHINE == lama_mapping_layout_sort[i] ) {
            continue;
        }

        /*
         * Skip 'board' level
         * JJH: HWLOC does not support BOARD at the moment
         */
        if( LAMA_LEVEL_BOARD == lama_mapping_layout_sort[i] ) {
            continue;
        }

        level_str = lama_type_enum_to_str(lama_mapping_layout_sort[i]);
        opal_output_verbose(11, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: Checking: %2d of %s",
                            pu_idx_ref[i], level_str);

        /*
         * Find the nth subtree matching the current key
         */
        topo_child = rmaps_lama_find_nth_subtree_match(cur_node->topology,
                                                       *topo_parent,
                                                       pu_idx_ref[i],
                                                       lama_mapping_layout_sort[i]);

        /*
         * If it does not exist, then this node is not capable of matching
         * so it is unavailable.
         */
        if( NULL == topo_child ) {
            opal_output_verbose(11, orte_rmaps_base.rmaps_output,
                                "mca:rmaps:lama: Check failed: Node %s does not have a %10s %2d",
                                cur_node->name, level_str, pu_idx_ref[i]);
            exit_status = ORTE_ERROR;
            goto cleanup;
        }

        /*
         * Keep descending the tree.
         * NOTE(review): ownership of the reference returned by
         * rmaps_lama_find_nth_subtree_match() is not visible here, so it
         * is not released - confirm whether it needs to be freed.
         */
        topo_parent = topo_child;

        free(level_str);
        level_str = NULL;
    }

    /*
     * We have sufficient hardware :)
     */

    /*
     * Return the native slot list to bind to
     * Internally checks the MPPR
     */
    *slot_list = get_native_slot_list(cur_node, topo_parent, pu_idx_ref);
    if( NULL == *slot_list ) {
        goto cleanup;
    }

 cleanup:
    if( NULL != level_str ) {
        free(level_str);
        level_str = NULL;
    }

    if( ORTE_SUCCESS != exit_status ) {
        if( NULL != *slot_list ) {
            free(*slot_list);
            *slot_list = NULL;
        }
    }

    return exit_status;
}
/*
 * Check (without modifying any counter) whether one more process may be
 * placed at 'child_obj' on 'node' under the MPPR limits.
 *
 * The ancestors of the object are checked first (the object itself is
 * excluded), then the object together with everything below it.
 *
 * Returns ORTE_SUCCESS immediately when no MPPR was provided
 * (lama_mppr_levels is NULL); otherwise returns the first non-success
 * code reported by either traversal, or ORTE_SUCCESS if both pass.
 */
static int rmaps_lama_check_mppr(orte_node_t *node,
                                 hwloc_obj_t *child_obj)
{
    int rc;

    /*
     * Nothing to enforce when no MPPR was given
     */
    if( NULL == lama_mppr_levels ) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: No MPPR to check - Skip...");
        return ORTE_SUCCESS;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Check ---------------------------");

    /*
     * Ancestors first, starting from the direct parent
     */
    rc = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, true);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Check ---------------------------");

    /*
     * Then the object itself and its descendants
     */
    rc = rmaps_lama_iter_mppr_children(node, child_obj, true);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Check ---------------------------");

    return ORTE_SUCCESS;
}
/*
 * Record the placement of one process at 'child_obj' on 'node' in the
 * MPPR accounting.
 *
 * The counters of the ancestors are incremented first (the object itself
 * is excluded), then the counters of the object and everything below it.
 * No limit is tested here; the traversals are run with check_only set to
 * false.
 *
 * Returns ORTE_SUCCESS immediately when no MPPR was provided
 * (lama_mppr_levels is NULL); otherwise returns the first non-success
 * code reported by either traversal, or ORTE_SUCCESS.
 */
static int rmaps_lama_inc_mppr(orte_node_t *node,
                               hwloc_obj_t *child_obj)
{
    int rc;

    /*
     * Nothing to account for when no MPPR was given
     */
    if( NULL == lama_mppr_levels ) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: No MPPR to increment - Skip...");
        return ORTE_SUCCESS;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Inc ---------------------------");

    /*
     * Ancestors first, starting from the direct parent
     */
    rc = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, false);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Inc ---------------------------");

    /*
     * Then the object itself and its descendants
     */
    rc = rmaps_lama_iter_mppr_children(node, child_obj, false);
    if( ORTE_SUCCESS != rc ) {
        return rc;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: Inc ---------------------------");

    return ORTE_SUCCESS;
}
/*
 * Walk from '*child_obj' up through its chain of parents, visiting the
 * MPPR accounting entry that each object holds for 'node'.
 *
 * check_only == true  : test each visited object; return ORTE_ERROR at
 *                       the first object whose limit (max >= 0) would be
 *                       exceeded by one more process, unless we are
 *                       currently oversubscribing.
 * check_only == false : add one to the current count of every visited
 *                       object.
 *
 * The walk starts at the object referenced by 'child_obj' itself and
 * stops when a NULL object is reached.  Returns ORTE_SUCCESS when the
 * whole chain has been visited.
 */
static int rmaps_lama_iter_mppr_parents(orte_node_t *node,
                                        hwloc_obj_t *child_obj,
                                        bool check_only)
{
    hwloc_obj_t *cursor;
    char desc[128];

    for( cursor = child_obj; NULL != *cursor; cursor = &((*cursor)->parent) ) {
        rmaps_lama_hwloc_user_t *udata;
        rmaps_lama_node_mppr_t *acct;

        /*
         * Locate the accounting entry of this object for this node
         */
        udata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*cursor)->userdata)->userdata;
        acct  = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(udata->node_mppr, node->index);

        hwloc_obj_snprintf(desc, sizeof(desc), node->topology, *cursor, "#", 0);
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:lama: %s: P [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)",
                            (check_only ? "Checking " : "Increment"),
                            node->index, node->name, desc,
                            acct->max,
                            (check_only ? acct->cur : acct->cur + 1),
                            (rmaps_lama_am_oversubscribing ? "T" : "F"),
                            (rmaps_lama_can_oversubscribe ? "T" : "F") );

        if( !check_only ) {
            /*
             * One more process allocated below this level
             */
            acct->cur += 1;
        }
        else if( acct->max >= 0 &&
                 !rmaps_lama_am_oversubscribing &&
                 (acct->cur) + 1 > acct->max ) {
            /*
             * First limit that would be exceeded ends the walk
             */
            return ORTE_ERROR;
        }
    }

    return ORTE_SUCCESS;
}
/*
 * Visit '*child_obj' and, recursively, every object below it, operating
 * on the MPPR accounting entry that each object holds for 'node'.
 *
 * check_only == true  : test each visited object; return ORTE_ERROR at
 *                       the first object whose limit (max >= 0) would be
 *                       exceeded by one more process, unless we are
 *                       currently oversubscribing.
 * check_only == false : add one to the current count of every visited
 *                       object.
 *
 * The object itself is handled before its children.  Returns
 * ORTE_SUCCESS once the whole subtree has been visited, or the first
 * non-success code produced while visiting it.
 */
static int rmaps_lama_iter_mppr_children(orte_node_t *node,
                                         hwloc_obj_t *child_obj,
                                         bool check_only)
{
    rmaps_lama_hwloc_user_t *udata = NULL;
    rmaps_lama_node_mppr_t *acct = NULL;
    char desc[128];
    int rc;
    int kid;

    /*
     * Locate the accounting entry of this object for this node
     */
    udata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata;
    acct  = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(udata->node_mppr, node->index);

    hwloc_obj_snprintf(desc, sizeof(desc), node->topology, *child_obj, "#", 0);
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:lama: %s: C [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)",
                        (check_only ? "Checking " : "Increment"),
                        node->index, node->name, desc,
                        acct->max,
                        (check_only ? acct->cur : acct->cur + 1),
                        (rmaps_lama_am_oversubscribing ? "T" : "F"),
                        (rmaps_lama_can_oversubscribe ? "T" : "F") );

    if( !check_only ) {
        /*
         * One more process allocated below this level
         */
        acct->cur += 1;
    }
    else if( acct->max >= 0 &&
             !rmaps_lama_am_oversubscribing &&
             (acct->cur) + 1 > acct->max ) {
        /*
         * First limit that would be exceeded ends the traversal
         */
        return ORTE_ERROR;
    }

    /*
     * Descend into each child in turn
     */
    kid = 0;
    while( kid < (int)(*child_obj)->arity ) {
        rc = rmaps_lama_iter_mppr_children(node, &((*child_obj)->children[kid]), check_only);
        if( ORTE_SUCCESS != rc ) {
            return rc;
        }
        ++kid;
    }

    return ORTE_SUCCESS;
}
/*
 * Build the slot list (a cpu list string) that a process mapped at
 * 'pu_obj' on 'cur_node' should be bound to.
 *
 * Starting at the ancestor of 'pu_obj' found for lama_binding_level, the
 * function visits lama_binding_num_levels consecutive cousins.  For each
 * one the MPPR is checked and the intersection of its allowed and online
 * cpusets is merged into the binding set.  Only after every cousin has
 * passed are the MPPR counters incremented, so a failed check never
 * requires a rollback.
 *
 * cur_node    : node being mapped onto
 * pu_obj      : reference to the hwloc object selected by the mapping
 * put_idx_ref : not referenced by this function
 *
 * Returns a string produced by hwloc_bitmap_list_asprintf(), or NULL if
 * 'pu_obj' is NULL, no binding parent is found, an MPPR check or
 * increment fails, or there are not enough cousins on the node.
 */
static char * get_native_slot_list(orte_node_t *cur_node, hwloc_obj_t *pu_obj, int *put_idx_ref)
{
    char *cpu_list = NULL;
    char *level_name = NULL;
    hwloc_obj_t *bind_root = NULL;
    hwloc_obj_t *cursor = NULL;
    hwloc_cpuset_t bound_set;
    hwloc_cpuset_t tmp_set;
    int n;

    /*
     * Nothing to do without an object to start from
     */
    if( NULL == pu_obj ) {
        return NULL;
    }

    /*
     * Climb the tree until the binding level is reached
     */
    bind_root = rmaps_lama_find_parent(cur_node->topology, pu_obj, lama_binding_level);
    if( NULL == bind_root ) {
        return NULL;
    }

    /*
     * bound_set accumulates the result, tmp_set is per-cousin scratch space
     */
    bound_set = hwloc_bitmap_alloc();
    hwloc_bitmap_zero(bound_set);
    tmp_set = hwloc_bitmap_alloc();

    /*
     * Pass 1: check the MPPR and gather the cpus of each cousin
     */
    cursor = bind_root;
    n = 0;
    while( n < lama_binding_num_levels ) {
        if( ORTE_SUCCESS != rmaps_lama_check_mppr(cur_node, cursor) ) {
            goto cleanup;
        }

        /*
         * Usable cpus of this cousin = allowed AND online
         *
         * JJH: TODO: Add resource offline check (?)
         * JJH: Maybe use opal_hwloc_base_get_available_cpus(cur_node->topology, (*cursor)) ?
         *      They do pretty much the same thing, but with more checks...
         */
        hwloc_bitmap_zero(tmp_set);
        hwloc_bitmap_and(tmp_set, (*cursor)->allowed_cpuset, (*cursor)->online_cpuset);
        hwloc_bitmap_or(bound_set, tmp_set, bound_set);

#if 0
        {
            hwloc_obj_snprintf(str, sizeof(str), cur_node->topology, *cursor, "#", 0);
            printf("--> BINDING TO -- %-20s \t -- %2d of %2d -- %2d vs %2d\n",str,
                   n, lama_binding_level,
                   (*bind_root)->logical_index, (*cursor)->logical_index);
            hwloc_bitmap_snprintf(str, sizeof(str), (*cursor)->allowed_cpuset );
            printf("--> CPU A : %-20s\n", str);
            hwloc_bitmap_snprintf(str, sizeof(str), (*cursor)->online_cpuset );
            printf("--> CPU B : %-20s\n", str);
            hwloc_bitmap_snprintf(str, sizeof(str), tmp_set);
            printf("--> CPU C : %-20s\n", str);
            hwloc_bitmap_snprintf(str, sizeof(str), bound_set);
            printf("--> CPU D : %-20s\n", str);
        }
#endif

        if( NULL == (*cursor)->next_cousin ) {
            /*
             * Ran out of cousins: an error unless this was the last one needed
             */
            if( (n+1) < lama_binding_num_levels ) {
                level_name = lama_type_enum_to_str(lama_binding_level);
                opal_output_verbose(10, orte_rmaps_base.rmaps_output,
                                    "mca:rmaps:lama: Error: Not able to bind to %*d x %10s - Stopped at %*d",
                                    MAX_BIND_DIGIT_LEN, lama_binding_num_levels,
                                    level_name,
                                    MAX_BIND_DIGIT_LEN, n);
                free(level_name);
                level_name = NULL;
                goto cleanup;
            }
        }
        else {
            /*
             * Move on to the next cousin
             */
            cursor = &((*cursor)->next_cousin);
        }

        ++n;
    }

    /*
     * Pass 2: every check passed, so now account for the placement
     */
    cursor = bind_root;
    n = 0;
    while( n < lama_binding_num_levels ) {
        if( ORTE_SUCCESS != rmaps_lama_inc_mppr(cur_node, cursor) ) {
            goto cleanup;
        }

        if( NULL != (*cursor)->next_cousin ) {
            cursor = &((*cursor)->next_cousin);
        }

        ++n;
    }

    /*
     * Turn the accumulated cpuset into the list string for the remote daemon
     */
    hwloc_bitmap_list_asprintf(&cpu_list, bound_set);

 cleanup:
    hwloc_bitmap_free(tmp_set);
    hwloc_bitmap_free(bound_set);

    return cpu_list;
}
/*********************************
* Timer Support
*********************************/
/*
 * Return the current time in seconds as a double.
 *
 * Uses opal_timer_base_get_usec() when OPAL_TIMER_USEC_NATIVE is set,
 * and gettimeofday() otherwise; in both cases the microsecond value is
 * scaled to seconds.
 */
static double rmaps_lama_get_time(void)
{
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);

    /* Whole seconds first, then the fractional part */
    seconds = now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;

    return seconds;
#endif
}
/*
 * Start or stop one of the module timers.
 *
 * idx      : timer to operate on; values outside
 *            [0, RMAPS_LAMA_TIMER_MAX) are ignored
 * is_start : true  - record the current time as the start of an interval
 *            false - record the current time as the end of the interval
 *                    and add (end - start) to the accumulated time
 */
static void rmaps_lama_set_time(int idx, bool is_start)
{
    /*
     * Reject indices on both sides of the timer arrays; a negative
     * index would otherwise write outside of them.
     */
    if( idx < 0 || idx >= RMAPS_LAMA_TIMER_MAX ) {
        return;
    }

    if( is_start ) {
        timer_start[idx] = rmaps_lama_get_time();
    } else {
        timer_end[idx]    = rmaps_lama_get_time();
        timer_accum[idx] += timer_end[idx] - timer_start[idx];
    }
}
/*
 * Print the accumulated time of every phase, the remaining overhead
 * (total minus the sum of the phases) and the total.
 *
 * The labels are kept in local storage rather than being duplicated on
 * the heap for each line, so no allocation can fail here.
 */
static void rmaps_lama_display_all_timers(void)
{
    /*
     * Phases reported individually, in display order.  The labels are
     * writable arrays because the display routine takes a 'char *'.
     */
    struct {
        int  timer_idx;
        char label[16];
    } phases[] = {
        { RMAPS_LAMA_TIMER_PARSE_PARAMS,   "Parse Params"   },
        { RMAPS_LAMA_TIMER_BUILD_MAX_TREE, "Build Max Tree" },
        { RMAPS_LAMA_TIMER_MAPPING,        "Mapping"        },
        { RMAPS_LAMA_TIMER_ORDERING,       "Ordering"       }
    };
    char other_label[] = "Other Overhead";
    char total_label[] = "Total";
    const size_t num_phases = sizeof(phases) / sizeof(phases[0]);
    double diff = 0.0;
    double total = 0.0;
    size_t p;

    opal_output(0,
                "mca:rmaps:lama: Timing: ---------------------------\n");

    /*
     * Individual phases; keep a running sum for the overhead line
     */
    for(p = 0; p < num_phases; ++p) {
        diff = timer_accum[phases[p].timer_idx];
        rmaps_lama_display_indv_timer_core(diff, phases[p].label);
        total += diff;
    }

    /*
     * Time spent outside of the phases above
     */
    diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL];
    rmaps_lama_display_indv_timer_core(diff - total, other_label);

    /*
     * Total
     */
    rmaps_lama_display_indv_timer_core(diff, total_label);

    opal_output(0,
                "mca:rmaps:lama: ---------------------------------");
}
/*
 * Reset the start, end and accumulated values of every timer to zero.
 */
static void rmaps_lama_clear_timers(void)
{
    int slot = RMAPS_LAMA_TIMER_MAX;

    /* Walk the timer slots from last to first */
    while( slot-- > 0 ) {
        timer_accum[slot] = 0.0;
        timer_end[slot]   = 0.0;
        timer_start[slot] = 0.0;
    }
}
/*
 * Print one timer line: the label, the time in milliseconds and the
 * share of the total interval it represents.
 *
 * diff : time to report, in seconds
 * str  : label printed in front of the value
 *
 * The percentage is relative to the last recorded start/end pair of the
 * total timer.  If that interval is zero (for example the total timer
 * never ran) the percentage is reported as 0 instead of dividing by
 * zero.
 */
static void rmaps_lama_display_indv_timer_core(double diff, char *str)
{
    double perc = 0;
    double total = 0;

    total = timer_end[RMAPS_LAMA_TIMER_TOTAL] - timer_start[RMAPS_LAMA_TIMER_TOTAL];
    if( 0.0 != total ) {
        perc = (diff/total) * 100;
    }

    opal_output(0,
                "mca:rmaps:lama: \t%-20s = %10.2f ms\t%6.2f %s\n",
                str, (diff * 1000), perc, "%");
}