/* * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "orte_config.h" #include "orte/constants.h" #include "orte/types.h" #include #ifdef HAVE_UNISTD_H #include #endif /* HAVE_UNISTD_H */ #ifdef HAVE_STRING_H #include #endif /* HAVE_STRING_H */ #include "opal/mca/base/mca_base_param.h" #include "opal/mca/hwloc/hwloc.h" #include "opal/util/argv.h" #include "opal/class/opal_tree.h" #include "orte/util/show_help.h" #include "orte/util/error_strings.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rmaps/base/rmaps_private.h" #include "orte/mca/rmaps/base/base.h" #include "orte/runtime/orte_globals.h" #include "rmaps_lama.h" #include MCA_timer_IMPLEMENTATION_HEADER /********************************* * Module setup *********************************/ static int orte_rmaps_lama_map(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_lama_module = { orte_rmaps_lama_map }; /********************************* * Timer *********************************/ #define RMAPS_LAMA_TIMER_TOTAL 0 #define RMAPS_LAMA_TIMER_PARSE_PARAMS 1 #define RMAPS_LAMA_TIMER_BUILD_MAX_TREE 2 #define RMAPS_LAMA_TIMER_MAPPING 3 #define RMAPS_LAMA_TIMER_ORDERING 4 #define RMAPS_LAMA_TIMER_MAX 5 static double rmaps_lama_get_time(void); static void rmaps_lama_set_time(int idx, bool is_start); static void rmaps_lama_display_all_timers(void); static void rmaps_lama_clear_timers(void); static void rmaps_lama_display_indv_timer_core(double diff, char *str); static double timer_start[RMAPS_LAMA_TIMER_MAX]; static double timer_end[RMAPS_LAMA_TIMER_MAX]; static double timer_accum[RMAPS_LAMA_TIMER_MAX]; #define RMAPS_LAMA_CLEAR_TIMERS() \ { \ if( rmaps_lama_timing_enabled ) { \ rmaps_lama_clear_timers(); \ } \ } #define RMAPS_LAMA_START_TIMER(idx) \ { \ if( rmaps_lama_timing_enabled ) { \ rmaps_lama_set_time(idx, true); \ } \ } #define RMAPS_LAMA_END_TIMER(idx) \ { \ if( rmaps_lama_timing_enabled ) { \ rmaps_lama_set_time(idx, false); \ } \ } #define RMAPS_LAMA_DISPLAY_TIMERS() \ { \ if( rmaps_lama_timing_enabled ) { \ rmaps_lama_display_all_timers(); \ } \ } /********************************* * Structures & Defines *********************************/ static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item); static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item); OBJ_CLASS_INSTANCE(rmaps_lama_hwloc_user_t, opal_object_t, rmaps_lama_hwloc_user_construct, rmaps_lama_hwloc_user_destruct); /********************************* * Globals *********************************/ /* * Mapping */ rmaps_lama_level_type_t *lama_mapping_layout = NULL; static rmaps_lama_level_type_t *lama_mapping_layout_sort = NULL; int lama_mapping_num_layouts = 0; /* * Binding */ rmaps_lama_level_type_t lama_binding_level = LAMA_LEVEL_UNKNOWN; static int lama_binding_num_levels = 0; /* * MPPR */ rmaps_lama_level_info_t *lama_mppr_levels = NULL; int lama_mppr_num_levels = 0; /* * Ordering */ static rmaps_lama_order_type_t lama_ordering = LAMA_ORDER_NATURAL; /* * Homogeneous system optimization */ bool lama_mppr_max_tree_homogeneous_system = false; /********************************* * Support Macros *********************************/ /********************************* * Support functions *********************************/ /* * Preprocess the command line arguments */ static int orte_rmaps_lama_process_params(orte_job_t *jdata); /* * Mapping Support: * Core mapping function */ static int orte_rmaps_lama_map_core(orte_job_t *jdata); /* * Mapping Support: * Recursive function for mapping process */ static int rmaps_lama_map_core_iter_level(orte_job_t *jdata, orte_app_context_t *cur_app_context, opal_list_t *node_list, orte_node_t **cur_mach_ptr, opal_tree_t *max_tree, int cur_level, int mach_level, int **pu_idx_ref, int **last_pu_idx_ref, int *num_mapped, int max_procs, int *iter_passes); /* * Mapping Support: * Access the next machine in the node list */ static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, opal_list_item_t *cur_mach); /* * Mapping Support: * Check the availability of the requested slot on the specified node */ static int check_node_availability(orte_node_t *cur_node, opal_tree_t *max_tree, int *pu_idx_ref, char **slot_list); /* * Mapping Support: * Debugging PU display */ static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc); static char * pu_ref_to_str(int *ref, int size); /* * Mapping Support: * Convert the process layout 'layer' to the sorted position for the PU */ static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer); /* * MPPR Support: * Check to make sure a process can be placed on this resource given the * MPPR restrictions. */ static int rmaps_lama_check_mppr(orte_node_t *node, hwloc_obj_t *child_obj); static int rmaps_lama_iter_mppr_parents(orte_node_t *node, hwloc_obj_t *child_obj, bool check_only); static int rmaps_lama_iter_mppr_children(orte_node_t *node, hwloc_obj_t *child_obj, bool check_only); /* * MPPR Support: * Increment parents of this child to account for a process being placed * on this resource. */ static int rmaps_lama_inc_mppr(orte_node_t *node, hwloc_obj_t *child_obj); /* * Mapping Support: * Return the native representation of the slot list */ static char * get_native_slot_list(orte_node_t *cur_node, hwloc_obj_t *pu_obj, int *put_idx_ref); /* * Ordering Support: * Reorder sequentially */ static int rmaps_lama_ordering_sequential(orte_job_t *jdata); /* * Map a single process to a specific node */ static int orte_rmaps_lama_map_process(orte_job_t *jdata, orte_node_t *node, int app_idx, orte_proc_t **proc); /********************************* * Main Module function to map a job *********************************/ static int orte_rmaps_lama_map(orte_job_t *jdata) { int ret, exit_status = ORTE_SUCCESS; mca_base_component_t *loc_comp = &mca_rmaps_lama_component.base_version; RMAPS_LAMA_CLEAR_TIMERS(); RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_TOTAL); /* * Sanity Check: * If we are not the 'chosen' mapper, then exit here */ if (NULL != jdata->map->req_mapper && 0 != strcasecmp(jdata->map->req_mapper, loc_comp->mca_component_name)) { /* a mapper has been specified, and it isn't me */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: job %s not using lama mapper (using %s)", ORTE_JOBID_PRINT(jdata->jobid), jdata->map->req_mapper); return ORTE_ERR_TAKE_NEXT_OPTION; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* * Identify this as the mapper responsible for this job */ if (NULL != jdata->map->last_mapper) { free(jdata->map->last_mapper); } jdata->map->last_mapper = strdup(loc_comp->mca_component_name); /* * Start at the beginning... */ jdata->num_procs = 0; /* * Process the command line arguments */ RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); if( ORTE_SUCCESS != (ret = orte_rmaps_lama_process_params(jdata)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); /* * Actually map the job */ if( ORTE_SUCCESS != (ret = orte_rmaps_lama_map_core(jdata)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * All Done */ RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_TOTAL); RMAPS_LAMA_DISPLAY_TIMERS(); cleanup: if( NULL != lama_mapping_layout ) { free(lama_mapping_layout); lama_mapping_layout = NULL; } if( NULL != lama_mapping_layout_sort ) { free(lama_mapping_layout_sort); lama_mapping_layout_sort = NULL; } if( NULL != lama_mppr_levels ) { free(lama_mppr_levels); lama_mppr_levels = NULL; } return exit_status; } /********************************* * User defined lookup structure for hwloc topology *********************************/ static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item) { item->node_mppr = OBJ_NEW(opal_pointer_array_t); opal_pointer_array_init(item->node_mppr, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE); } static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item) { orte_std_cntr_t i; if( NULL != item->node_mppr ) { for(i = 0; i < item->node_mppr->size; ++i) { if( NULL != item->node_mppr->addr[i] ) { OBJ_RELEASE(item->node_mppr->addr[i]); item->node_mppr->addr[i] = NULL; } } OBJ_RELEASE(item->node_mppr); item->node_mppr = NULL; } } /********************************* * Command line parameter parsing functions *********************************/ static int orte_rmaps_lama_process_params(orte_job_t *jdata) { int ret, i; char *type_str = NULL; /* * Process map/bind/order/mppr aliases */ if( ORTE_SUCCESS != (ret = rmaps_lama_process_alias_params(jdata) ) ) { opal_output(0, "mca:rmaps:lama: ERROR: Failed while processing aliases"); return ret; } /* * Parse: Binding */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- Binding : [%s]", rmaps_lama_cmd_bind); if( ORTE_SUCCESS != (ret = rmaps_lama_parse_binding(rmaps_lama_cmd_bind, &lama_binding_level, &lama_binding_num_levels)) ) { opal_output(0, "mca:rmaps:lama: ERROR: Invalid Binding String: %s", rmaps_lama_cmd_bind); return ret; } if( 10 <= opal_output_get_verbosity(orte_rmaps_base.rmaps_output) ) { type_str = lama_type_enum_to_str(lama_binding_level); opal_output_verbose(10, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- Binding : %*d x %10s", MAX_BIND_DIGIT_LEN, lama_binding_num_levels, type_str); free(type_str); type_str = NULL; } /* Reset the binding option since we are going to do it ourselves */ OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); /* * Parse: Mapping from Process Layout string */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- Mapping : [%s]", rmaps_lama_cmd_map); if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mapping(rmaps_lama_cmd_map, &lama_mapping_layout, &lama_mapping_layout_sort, &lama_mapping_num_layouts)) ) { opal_output(0, "mca:rmaps:lama: ERROR: Invalid Mapping Process Layout: %s", rmaps_lama_cmd_map); return ret; } if( 10 <= opal_output_get_verbosity(orte_rmaps_base.rmaps_output) ) { for( i = 0; i < lama_mapping_num_layouts; ++i ) { type_str = lama_type_enum_to_str(lama_mapping_layout[i]); opal_output_verbose(10, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- Mapping : (%d) %10s (%d vs %d)", i, type_str, lama_mapping_layout[i], lama_mapping_layout_sort[i]); free(type_str); type_str = NULL; } } /* * Parse: MPPR */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- MPPR : [%s]", rmaps_lama_cmd_mppr); if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mppr(rmaps_lama_cmd_mppr, &lama_mppr_levels, &lama_mppr_num_levels)) ) { opal_output(0, "mca:rmaps:lama: ERROR: Invalid MPPR: %s", rmaps_lama_cmd_mppr); return ret; } if( 10 <= opal_output_get_verbosity(orte_rmaps_base.rmaps_output) ) { for( i = 0; i < lama_mppr_num_levels; ++i ) { type_str = lama_type_enum_to_str(lama_mppr_levels[i].type); opal_output_verbose(10, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- MPPR : %*d at %10s", MAX_BIND_DIGIT_LEN, lama_mppr_levels[i].max_resources, type_str); free(type_str); type_str = NULL; } } /* * Parse: Ordering */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- Ordering : [%s]", rmaps_lama_cmd_ordering); if( ORTE_SUCCESS != (ret = rmaps_lama_parse_ordering(rmaps_lama_cmd_ordering, &lama_ordering)) ) { opal_output(0, "mca:rmaps:lama: ERROR: Invalid Ordering Argument: %s", rmaps_lama_cmd_ordering); return ret; } if( 10 <= opal_output_get_verbosity(orte_rmaps_base.rmaps_output) ) { if( LAMA_ORDER_NATURAL == lama_ordering ) { type_str = strdup("Natural"); } else if( LAMA_ORDER_SEQ == lama_ordering ) { type_str = strdup("Sequential"); } else { type_str = strdup("Unknown"); } opal_output_verbose(10, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ----- Ordering : %10s", type_str); free(type_str); type_str = NULL; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); return ORTE_SUCCESS; } /********************************* * Support functions *********************************/ rmaps_lama_level_type_t lama_type_str_to_enum(char *param) { if( 0 == strncmp(param, "n", strlen("n")) ) { return LAMA_LEVEL_MACHINE; } else if( 0 == strncmp(param, "b", strlen("b")) ) { return LAMA_LEVEL_BOARD; } else if( 0 == strncmp(param, "s", strlen("s")) ) { return LAMA_LEVEL_SOCKET; } else if( 0 == strncmp(param, "c", strlen("c")) ) { return LAMA_LEVEL_CORE; } else if( 0 == strncmp(param, "h", strlen("h")) ) { return LAMA_LEVEL_PU; } else if( 0 == strncmp(param, "L1", strlen("L1")) ) { return LAMA_LEVEL_CACHE_L1; } else if( 0 == strncmp(param, "L2", strlen("L2")) ) { return LAMA_LEVEL_CACHE_L2; } else if( 0 == strncmp(param, "L3", strlen("L3")) ) { return LAMA_LEVEL_CACHE_L3; } else if( 0 == strncmp(param, "N", strlen("N")) ) { return LAMA_LEVEL_NUMA; } return LAMA_LEVEL_UNKNOWN; } char * lama_type_enum_to_str(rmaps_lama_level_type_t param) { if( LAMA_LEVEL_MACHINE == param ) { return strdup("Machine"); } else if( LAMA_LEVEL_BOARD == param ) { return strdup("Board"); } else if( LAMA_LEVEL_SOCKET == param ) { return strdup("Socket"); } else if( LAMA_LEVEL_CORE == param ) { return strdup("Core"); } else if( LAMA_LEVEL_PU == param ) { return strdup("Hw. Thread"); } else if( LAMA_LEVEL_CACHE_L1 == param ) { return strdup("L1 Cache"); } else if( LAMA_LEVEL_CACHE_L2 == param ) { return strdup("L2 Cache"); } else if( LAMA_LEVEL_CACHE_L3 == param ) { return strdup("L3 Cache"); } else if( LAMA_LEVEL_NUMA == param ) { return strdup("NUMA"); } return strdup("Unknown"); } /********************************* * Core Mapper function *********************************/ static int orte_rmaps_lama_map_core(orte_job_t *jdata) { int ret, exit_status = ORTE_SUCCESS; int cur_app_idx = 0; int num_slots, num_nodes; orte_app_context_t *cur_app_context = NULL; orte_node_t *cur_mach = NULL; orte_node_t **cur_mach_ptr = NULL; orte_proc_t *proc = NULL; opal_list_t *node_list = NULL; opal_list_item_t *item = NULL; opal_tree_t *max_tree = NULL; int *pu_idx_ref = NULL; int *last_pu_idx_ref = NULL; int i, num_mapped, last_num_mapped, mach_level = -1; orte_std_cntr_t j; int max_procs_to_map; int iter_passes; char * last_level_str = NULL; bool initial_map = true; /* * Setup PU reference * Find the position of the 'machine' */ pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); last_pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); for( i = 0; i < lama_mapping_num_layouts; ++i ) { pu_idx_ref[i] = 0; last_pu_idx_ref[i] = -1; if( LAMA_LEVEL_MACHINE == lama_mapping_layout[i] ) { mach_level = i; } } /* * Foreach app context */ for(cur_app_idx = 0; cur_app_idx < jdata->apps->size; ++cur_app_idx ) { if( NULL == (cur_app_context = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, cur_app_idx))) { continue; } /* * Get the list of nodes for this app_context. */ node_list = OBJ_NEW(opal_list_t); ret = orte_rmaps_base_get_target_nodes(node_list, &num_slots, cur_app_context, jdata->map->mapping, initial_map, false); if(ORTE_SUCCESS != ret ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } num_nodes = (orte_std_cntr_t)opal_list_get_size(node_list); /* Flag that all subsequent requests should not reset the node->mapped flag */ initial_map = false; /* * If a bookmark exists from some prior mapping, then start from there */ cur_mach = (orte_node_t*)orte_rmaps_base_get_starting_point(node_list, jdata); /* * If the application did not specify the number of procs * then set it to the number of 'slots' * JJH: TODO: Revisit 'max_procs' calculation */ if (0 == cur_app_context->num_procs) { cur_app_context->num_procs = num_slots; } max_procs_to_map = cur_app_context->num_procs; /* * Build the Max Tree */ RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); max_tree = rmaps_lama_create_empty_max_tree(); if( ORTE_SUCCESS != (ret = rmaps_lama_build_max_tree(jdata, node_list, max_tree, &lama_mppr_max_tree_homogeneous_system)) ) { exit_status = ret; goto cleanup; } RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Mapping: -----------------------"); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_MAPPING); /* * Clear PU reference */ for( i = 0; i < lama_mapping_num_layouts; ++i ) { pu_idx_ref[i] = 0; } /* * Mapping: Recursively loop over all levels */ num_mapped = 0; last_num_mapped = 0; iter_passes = 0; cur_mach_ptr = (orte_node_t**)malloc(sizeof(orte_node_t*)); *cur_mach_ptr = cur_mach; while( max_procs_to_map > num_mapped ) { ret = rmaps_lama_map_core_iter_level(jdata, cur_app_context, node_list, cur_mach_ptr, max_tree, lama_mapping_num_layouts-1, mach_level, &pu_idx_ref, &last_pu_idx_ref, &num_mapped, max_procs_to_map, &iter_passes); if( ORTE_SUCCESS != ret ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* * We only get here (without finishing the mapping) if we are going to * start oversubscribing resources. */ if( max_procs_to_map > num_mapped ) { if( !rmaps_lama_can_oversubscribe ) { orte_show_help("help-orte-rmaps-lama.txt", "orte-rmaps-lama:oversubscribe", true, num_mapped, max_procs_to_map); exit_status = ORTE_ERROR; goto cleanup; } else { rmaps_lama_am_oversubscribing = true; } } /* * Check to see if we have made any progress in the mapping loop */ if( 0 < cur_app_idx && 2 == iter_passes ) { /* * Give it another pass: * This is an edge case when we are trying to restart from a * bookmark left by a previous app context. If this app context * is starting from exactly the beginning of the allocation * then the recursive loop could return out here after the * increment pass. This is indicated by (iter_passes = 2). * Since no processes were mapped, we just try again. */ } else if( last_num_mapped == num_mapped ) { orte_show_help("help-orte-rmaps-lama.txt", "orte-rmaps-lama:no-resources-available", true, cur_app_idx, num_mapped, max_procs_to_map, (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering)); exit_status = ORTE_ERROR; goto cleanup; } else { last_num_mapped = num_mapped; } } /* * Display Bookmark for debugging */ last_level_str = pu_ref_to_str(last_pu_idx_ref, lama_mapping_num_layouts); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Bookmark: --> Node %10s PU %10s", jdata->bookmark->name, last_level_str); free(last_level_str); last_level_str = NULL; /* * Clenup for next iteration */ if( NULL != node_list ) { while(NULL != (item = opal_list_remove_first(node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(node_list); node_list = NULL; } OBJ_RELEASE(max_tree); max_tree = NULL; } RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_MAPPING); /* * Ordering */ RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_ORDERING); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); if( LAMA_ORDER_SEQ == lama_ordering ) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Ordering: Sequential ------------"); if( ORTE_SUCCESS != (ret = rmaps_lama_ordering_sequential(jdata)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Ordering: Natural ---------------"); #if 0 /* * We compute our own vpids inline with the algorithm. So no need to use the * orte_rmaps_base_compute_vpids() function. */ #endif } RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_ORDERING); /* * Display Mapping */ if( 10 <= opal_output_get_verbosity(orte_rmaps_base.rmaps_output) ) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); for( j = 0; j < jdata->procs->size; ++j) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { continue; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Ordering: Proc. %2d on Node %10s - Slot %s", proc->name.vpid, proc->node->name, proc->cpu_bitmap); } } /* * All done */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Finished ------------------------"); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); cleanup: if( NULL != node_list ) { while(NULL != (item = opal_list_remove_first(node_list))) { OBJ_RELEASE(item); } OBJ_RELEASE(node_list); node_list = NULL; } if( NULL != max_tree ) { OBJ_RELEASE(max_tree); max_tree = NULL; } if( NULL != pu_idx_ref ) { free(pu_idx_ref); pu_idx_ref = NULL; } if( NULL != last_level_str ) { free(last_level_str); last_level_str = NULL; } return exit_status; } static int rmaps_lama_map_core_iter_level(orte_job_t *jdata, orte_app_context_t *cur_app_context, opal_list_t *node_list, orte_node_t **cur_mach_ptr, opal_tree_t *max_tree, int cur_level, int mach_level, int **pu_idx_ref, int **last_pu_idx_ref, int *num_mapped, int max_procs, int *iter_passes) { int ret, exit_status = ORTE_SUCCESS; int i, j; opal_tree_item_t *tree_for_level = NULL; int max_subtree_arity = 0; char * level_str = NULL; char * last_level_str = NULL; char * slot_list = NULL; orte_proc_t *proc = NULL; int pu_idx = 0; /* * Find the current tree for this level * If it is the machine level, then we need to access the information from * the node list, not the max_tree. */ if( cur_level != mach_level ) { tree_for_level = opal_tree_find_with(opal_tree_get_root(max_tree), &lama_mapping_layout[cur_level]); /* * We do not need subtree, but the arity of the subtree * JJH TODO: This should be an opal_tree function. */ max_subtree_arity = 1; /* include self */ while( NULL != (tree_for_level = opal_tree_get_next_sibling(tree_for_level)) ) { ++max_subtree_arity; } } else if( NULL == *cur_mach_ptr ) { *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); } pu_idx = convert_layer_to_sort_idx(lama_mapping_layout[cur_level]); level_str = lama_type_enum_to_str(lama_mapping_layout[cur_level]); /* * Do we need to advance to a bookmark */ if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { /* * Display last mapped */ last_level_str = pu_ref_to_str(*last_pu_idx_ref, lama_mapping_num_layouts); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Bookmark: --> Last Mapped: Node %10s (bkmrk %10s) PU %10s - Level %2d", (NULL == *cur_mach_ptr ? "(NULL)" : (*cur_mach_ptr)->name), jdata->bookmark->name, last_level_str, (*last_pu_idx_ref)[pu_idx]); free(last_level_str); last_level_str = NULL; /* * Set the level starting point to the last known index */ i = (*last_pu_idx_ref)[pu_idx]; } else { i = 0; } /* * Loop over all siblings at this level * Initial condition above, Increment at bottom, Break check at bottom */ while( 1 ) { /* * Define the PU index */ (*pu_idx_ref)[pu_idx] = i; if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s - Increment only", cur_level+1, level_str, pu_idx, i, max_subtree_arity, (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); } else { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s", cur_level+1, level_str, pu_idx, i, max_subtree_arity, (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); } /* * If not the inner most loop, iterate to the next level down */ if( cur_level > 0 ) { ret = rmaps_lama_map_core_iter_level(jdata, cur_app_context, node_list, cur_mach_ptr, max_tree, cur_level - 1, mach_level, pu_idx_ref, last_pu_idx_ref, num_mapped, max_procs, iter_passes); if( ORTE_SUCCESS != ret ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } /* * If we are restarting the iteration from a previous bookmark then * the first pass through is a no-op mapping pass that just increments * the PU reference. * Called by innermost loop */ else if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { *iter_passes += 1; } /* * Try to map at this location */ else { /* * On first pass, make sure we increment this, just so we do not * accidentally think this is an increment pass. */ if( 0 == *iter_passes ) { *iter_passes += 1; } /* * Display the PU ref for debugging */ display_pu_ref(*pu_idx_ref, lama_mapping_num_layouts, *num_mapped, proc); /* * Check to see if this resource is available on this node. * * In a heterogeneous or otherwise non-uniformly restricted * environment we may iterate to a resource that is not * available either because it does not exist, or is not * available for allocation (off-lined, sub-node allocation). * Additionally, we need to check resource constrains expressed * in the MPPR and binding. */ ret = check_node_availability((*cur_mach_ptr), max_tree, *pu_idx_ref, &slot_list); if( ORTE_SUCCESS != ret || NULL == slot_list ) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:hwtopo: Mapping: --> Level %2d: %s - INVALID/SKIP", cur_level+1, level_str); /* * By not mapping here we just let the iterations continue * until a suitable match is found or we have exhausted all * possible locations to match and thus cannot map any more. */ } else { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Mapping: --> Level %2d: %s - Slot List (%s)", cur_level+1, level_str, slot_list); /* * Map this process onto the resource specified * level_tree_objs[*] and cur_mach point to the specific resource */ proc = NULL; ret = orte_rmaps_lama_map_process(jdata, (*cur_mach_ptr), cur_app_context->idx, &proc); if( ORTE_SUCCESS != ret ) { ORTE_ERROR_LOG(ret); return ret; } /* * Set the binding for this process */ proc->cpu_bitmap = strdup(slot_list); /** JJH: Need to associate with an HWLOC object... hummm.... */ proc->locale = NULL; /* proc->locale = obj; */ /* * Insert the proc into the 'native' ordering location. */ proc->name.vpid = jdata->num_procs; if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } jdata->num_procs += 1; /* * Save a bookmark so we can return here later if necessary */ for( j = 0; j < lama_mapping_num_layouts; ++j ) { (*last_pu_idx_ref)[j] = (*pu_idx_ref)[j]; } jdata->bookmark = (orte_node_t*)(*cur_mach_ptr); (*num_mapped)++; } } /* * Increment loop * * If we are binding, then we may need to advance the binding layer * by more than one. */ if( cur_level != mach_level ) { if( lama_binding_level == lama_mapping_layout[cur_level] ) { i += lama_binding_num_levels; } else { ++i; } } else { /* * Note: Currently we do not allow for 'binding' to multiple machines * But keep the code just in case we want to play with 'stride' later */ if( lama_binding_level == lama_mapping_layout[cur_level] && lama_binding_num_levels > 1) { opal_output(0, "mca:rmaps:lama: ERROR: Cannot bind to multiple machines - SHOULD NEVER HAPPEN: %s", rmaps_lama_cmd_bind); return ORTE_ERROR; #if 0 for( j = 0; j < lama_binding_num_levels; ++j ) { cur_mach = get_next_machine(jdata, node_list, (opal_list_item_t*)cur_mach); if( NULL == cur_mach ) { break; } ++i; } #endif } else { *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); ++i; } } /* * Check if we are done mapping before iterating again */ if( max_procs <= *num_mapped ) { exit_status = ORTE_SUCCESS; goto cleanup; } /* * Check if we are done looping */ if( cur_level != mach_level ) { if( i >= max_subtree_arity ) { break; } } else { if( NULL == *cur_mach_ptr ) { break; } } } /* * Sanity Check: Check if we are done mapping */ if( max_procs <= *num_mapped ) { exit_status = ORTE_SUCCESS; goto cleanup; } cleanup: /* * If the outermost layer, the increment the number of iteration passes. */ if( cur_level == lama_mapping_num_layouts-1 ) { *iter_passes += 1; } if( NULL != level_str ) { free(level_str); level_str = NULL; } if( NULL != slot_list ) { free(slot_list); slot_list = NULL; } return exit_status; } static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, opal_list_item_t *cur_mach) { orte_node_t *next_mach = NULL; if( NULL == cur_mach ) { next_mach = (orte_node_t*)opal_list_get_first(node_list); } else if( opal_list_get_last(node_list) == cur_mach ) { next_mach = NULL; } else { next_mach = (orte_node_t*)opal_list_get_next(cur_mach); } return next_mach; } static int orte_rmaps_lama_map_process(orte_job_t *jdata, orte_node_t *node, int app_idx, orte_proc_t **proc) { int ret; /* * Add this node to the map, but only once */ if( !node->mapped ) { if (ORTE_SUCCESS > (ret = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(ret); return ret; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } /* * Setup the process object */ if (NULL == (*proc = orte_rmaps_base_setup_proc(jdata, node, app_idx))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } return ORTE_SUCCESS; } static int rmaps_lama_ordering_sequential(orte_job_t *jdata) { orte_job_map_t *map; orte_proc_t *proc = NULL, *swap = NULL; orte_std_cntr_t i, j; int cur_rank = 0; orte_node_t *cur_node = NULL; map = jdata->map; opal_output_verbose(15, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); /* * Assign the ranks sequentially */ for( i = 0; i < map->nodes->size; ++i) { if (NULL == (cur_node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } for( j = 0; j < cur_node->procs->size; ++j) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(cur_node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } opal_output_verbose(15, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Ordering: Rename Proc. %2d to %2d (Rev. %s)", proc->name.vpid, cur_rank, proc->node->name); proc->name.vpid = cur_rank; ++cur_rank; } } /* * Fix the job structure ordering - Sort by new vpid * * If we do not do this then the remote daemons assign the incorrect * ranks to the processes since they use the relative ordering in the * jdata->procs structure to determine vpids locally. * * JJH: Look at combining these loops with the loop in the core so we * JJH: do not have to iterate over the list two times */ opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); cur_rank = 0; for( j = 0; j < jdata->procs->size; ++j) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { continue; } opal_output_verbose(15, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Ordering: Proc. %2d on Node %s", proc->name.vpid, proc->node->name); while((int)proc->name.vpid != cur_rank ) { swap = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid); opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); opal_pointer_array_set_item(jdata->procs, cur_rank, swap); opal_output_verbose(15, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Ordering: \t SWAP Proc. %2d (%d) and Proc. %2d (%d)", proc->name.vpid, cur_rank, swap->name.vpid, proc->name.vpid); proc = swap; } ++cur_rank; } return ORTE_SUCCESS; } static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer) { int i; for(i = 0; i < lama_mapping_num_layouts; ++i ) { if( lama_mapping_layout_sort[i] == layer ) { return i; } } return 0; } static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc) { char *str = NULL; str = pu_ref_to_str(ref, size); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Mapping: PU Ref: %s [Rank %2d] Name: %s", str, rank, (NULL == proc ? "(null)" : ORTE_NAME_PRINT(&proc->name))); free(str); return; } static char * pu_ref_to_str(int *ref, int size) { int i, idx; char *str = NULL; str = (char *)malloc(sizeof(char) * (2 * size)); for(i = 0, idx = 0; i < size; ++i, idx += 2) { sprintf(&(str[idx]), "%2d", ref[i]); } return str; } static int check_node_availability(orte_node_t *cur_node, opal_tree_t *max_tree, int *pu_idx_ref, char **slot_list) { int exit_status = ORTE_SUCCESS; int i; char * level_str = NULL; hwloc_obj_t *topo_child = NULL, *topo_parent=NULL; opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Checking: Node (%s) -------------", cur_node->name); opal_output_verbose(11, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: ---------------------------------"); /* * Determine if the current node has the necessary hardware * as described by the PU index. * Find the hwloc object reference for the resource pointed to * by the PU index. * JJH TODO: If homogeneous system then this could be simplified. */ topo_parent = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); *topo_parent = hwloc_get_obj_by_depth(cur_node->topology, 0, 0); for( i = 0; i < lama_mapping_num_layouts; ++i ) { /* * Skip 'machine' level */ if( LAMA_LEVEL_MACHINE == lama_mapping_layout_sort[i] ) { continue; } /* * Skip 'board' level * JJH: HWLOC does not support BOARD at the moment */ if( LAMA_LEVEL_BOARD == lama_mapping_layout_sort[i] ) { continue; } level_str = lama_type_enum_to_str(lama_mapping_layout_sort[i]); opal_output_verbose(11, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Checking: %2d of %s", pu_idx_ref[i], level_str); /* * Find the nth subtree matching the current key */ topo_child = rmaps_lama_find_nth_subtree_match(cur_node->topology, *topo_parent, pu_idx_ref[i], lama_mapping_layout_sort[i]); /* * If it does not exist, then this node is not capable of matching * so it is unavailable. */ if( NULL == topo_child ) { opal_output_verbose(11, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Check failed: Node %s does not have a %10s %2d", cur_node->name, level_str, pu_idx_ref[i]); exit_status = ORTE_ERROR; goto cleanup; } /* * Keep decending the tree */ topo_parent = topo_child; free(level_str); level_str = NULL; } /* * We have sufficient hardware :) */ /* * Return the native slot list to bind to * Internally checks the MPPR */ *slot_list = get_native_slot_list(cur_node, topo_parent, pu_idx_ref); if( NULL == *slot_list ) { goto cleanup; } cleanup: if( NULL != level_str ) { free(level_str); level_str = NULL; } if( ORTE_SUCCESS != exit_status ) { if( NULL != *slot_list ) { free(*slot_list); *slot_list = NULL; } } return exit_status; } static int rmaps_lama_check_mppr(orte_node_t *node, hwloc_obj_t *child_obj) { int ret; /* * Optimization if no MPPR provided */ if( NULL == lama_mppr_levels ) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: No MPPR to check - Skip..."); return ORTE_SUCCESS; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Check ---------------------------"); /* * Check Parents (excluding self) */ if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, true)) ) { return ret; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Check ---------------------------"); /* * Check Children (including self) */ if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, true)) ) { return ret; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Check ---------------------------"); return ORTE_SUCCESS; } static int rmaps_lama_inc_mppr(orte_node_t *node, hwloc_obj_t *child_obj) { int ret; /* * Optimization if no MPPR provided */ if( NULL == lama_mppr_levels ) { opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: No MPPR to increment - Skip..."); return ORTE_SUCCESS; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Inc ---------------------------"); /* * Increment Parents (excluding self) */ if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, false)) ) { return ret; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Inc ---------------------------"); /* * Increment Children (including self) */ if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, false)) ) { return ret; } opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Inc ---------------------------"); return ORTE_SUCCESS; } static int rmaps_lama_iter_mppr_parents(orte_node_t *node, hwloc_obj_t *child_obj, bool check_only) { rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; rmaps_lama_node_mppr_t *mppr_accounting = NULL; char str[128]; /* * Basecase */ if( NULL == *child_obj ) { return ORTE_SUCCESS; } /* * Check self */ /* * Access MPPR info for this object */ hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: %s: P [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", (check_only ? "Checking " : "Increment"), node->index, node->name, str, mppr_accounting->max, (check_only ? mppr_accounting->cur : mppr_accounting->cur + 1), (rmaps_lama_am_oversubscribing ? "T" : "F"), (rmaps_lama_can_oversubscribe ? "T" : "F") ); /* * Check limits - Error on first to exceed */ if( check_only ) { if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { return ORTE_ERROR; } } } /* * Increment current number allocated below this level */ else { mppr_accounting->cur += 1; } /* * Go to parent */ return rmaps_lama_iter_mppr_parents(node, &((*child_obj)->parent), check_only); } static int rmaps_lama_iter_mppr_children(orte_node_t *node, hwloc_obj_t *child_obj, bool check_only) { int ret; rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; rmaps_lama_node_mppr_t *mppr_accounting = NULL; char str[128]; int i; /* * Check self */ /* * Access MPPR info for this object */ hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); opal_output_verbose(5, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: %s: C [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", (check_only ? "Checking " : "Increment"), node->index, node->name, str, mppr_accounting->max, (check_only ? mppr_accounting->cur : mppr_accounting->cur + 1), (rmaps_lama_am_oversubscribing ? "T" : "F"), (rmaps_lama_can_oversubscribe ? "T" : "F") ); /* * Check limits - Error on first to exceed */ if( check_only ) { if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { return ORTE_ERROR; } } } /* * Increment current number allocated below this level */ else { mppr_accounting->cur += 1; } /* * Check all children */ for(i = 0; i < (int)(*child_obj)->arity; ++i ) { if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, &((*child_obj)->children[i]), check_only)) ) { return ret; } } return ORTE_SUCCESS; } static char * get_native_slot_list(orte_node_t *cur_node, hwloc_obj_t *pu_obj, int *put_idx_ref) { int i; char *slot_list = NULL; hwloc_obj_t *binding_parent = NULL; hwloc_obj_t *cur_parent = NULL; hwloc_cpuset_t binding_cpuset; hwloc_cpuset_t scratch_cpuset; char *type_str = NULL; /* * Sanity check */ if( NULL == pu_obj ) { return NULL; } /* * Determine the cpumask to send to the backend for binding */ /* * Iterate up the tree until we reach the binding parent */ binding_parent = rmaps_lama_find_parent(cur_node->topology, pu_obj, lama_binding_level); if( NULL == binding_parent ) { return NULL; } /* * Iterate across cousins until we find enough resources or hit the node boundary */ binding_cpuset = hwloc_bitmap_alloc(); hwloc_bitmap_zero(binding_cpuset); scratch_cpuset = hwloc_bitmap_alloc(); cur_parent = binding_parent; for(i = 0; i < lama_binding_num_levels; ++i) { /* * Check MPPR Availability */ if( ORTE_SUCCESS != rmaps_lama_check_mppr(cur_node, cur_parent) ) { goto cleanup; } /* * Accumulate the bitmask * * JJH: TODO: Add resource offline check (?) */ hwloc_bitmap_zero(scratch_cpuset); /* JJH: Maybe use opal_hwloc_base_get_available_cpus(cur_node->topology, (*cur_parent)) ? * They do pretty much the same thing, but with more checks... */ hwloc_bitmap_and(scratch_cpuset, (*cur_parent)->allowed_cpuset, (*cur_parent)->online_cpuset); hwloc_bitmap_or(binding_cpuset, scratch_cpuset, binding_cpuset); #if 0 { hwloc_obj_snprintf(str, sizeof(str), cur_node->topology, *cur_parent, "#", 0); printf("--> BINDING TO -- %-20s \t -- %2d of %2d -- %2d vs %2d\n",str, i, lama_binding_level, (*binding_parent)->logical_index, (*cur_parent)->logical_index); hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->allowed_cpuset ); printf("--> CPU A : %-20s\n", str); hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->online_cpuset ); printf("--> CPU B : %-20s\n", str); hwloc_bitmap_snprintf(str, sizeof(str), scratch_cpuset); printf("--> CPU C : %-20s\n", str); hwloc_bitmap_snprintf(str, sizeof(str), binding_cpuset); printf("--> CPU D : %-20s\n", str); } #endif /* * Iterate to the next cousin. * If we exceed the boundary of the node, then send up an error. */ if( (i+1) < lama_binding_num_levels && NULL == (*cur_parent)->next_cousin ) { type_str = lama_type_enum_to_str(lama_binding_level); opal_output_verbose(10, orte_rmaps_base.rmaps_output, "mca:rmaps:lama: Error: Not able to bind to %*d x %10s - Stopped at %*d", MAX_BIND_DIGIT_LEN, lama_binding_num_levels, type_str, MAX_BIND_DIGIT_LEN, i); free(type_str); type_str = NULL; goto cleanup; } /* * Point to the next cousin */ if( NULL != (*cur_parent)->next_cousin ) { cur_parent = &((*cur_parent)->next_cousin); } } /* * Account for the process placement in the MPPR * Assumes a previous check * We cannot do this in the loop, since if the MPPR check fails we would * need to roll back previous increments. */ cur_parent = binding_parent; for(i = 0; i < lama_binding_num_levels; ++i) { /* * Account for the process placement in the MPPR * Assumes a previous check. */ if( ORTE_SUCCESS != rmaps_lama_inc_mppr(cur_node, cur_parent) ) { goto cleanup; } /* * Point to the next cousin */ if( NULL != (*cur_parent)->next_cousin ) { cur_parent = &((*cur_parent)->next_cousin); } } /* * Convert the cpuset to a slot_list for the remote daemon */ hwloc_bitmap_list_asprintf(&slot_list, binding_cpuset); cleanup: hwloc_bitmap_free(scratch_cpuset); hwloc_bitmap_free(binding_cpuset); return slot_list; } /********************************* * Timer Support *********************************/ static double rmaps_lama_get_time(void) { double wtime; #if OPAL_TIMER_USEC_NATIVE wtime = (double)opal_timer_base_get_usec() / 1000000.0; #else struct timeval tv; gettimeofday(&tv, NULL); wtime = tv.tv_sec; wtime += (double)tv.tv_usec / 1000000.0; #endif return wtime; } static void rmaps_lama_set_time(int idx, bool is_start) { if(idx < RMAPS_LAMA_TIMER_MAX ) { if( is_start ) { timer_start[idx] = rmaps_lama_get_time(); } else { timer_end[idx] = rmaps_lama_get_time(); timer_accum[idx] += timer_end[idx] - timer_start[idx]; } } } static void rmaps_lama_display_all_timers(void) { double diff = 0.0; double total = 0.0; char * label = NULL; opal_output(0, "mca:rmaps:lama: Timing: ---------------------------\n"); /* * Timer: Parse Parameters */ label = strdup("Parse Params"); diff = timer_accum[RMAPS_LAMA_TIMER_PARSE_PARAMS]; rmaps_lama_display_indv_timer_core(diff, label); free(label); total += diff; /* * Timer: Build Max Tree */ label = strdup("Build Max Tree"); diff = timer_accum[RMAPS_LAMA_TIMER_BUILD_MAX_TREE]; rmaps_lama_display_indv_timer_core(diff, label); free(label); total += diff; /* * Timer: Mapping */ label = strdup("Mapping"); diff = timer_accum[RMAPS_LAMA_TIMER_MAPPING]; rmaps_lama_display_indv_timer_core(diff, label); free(label); total += diff; /* * Timer: Ordering */ label = strdup("Ordering"); diff = timer_accum[RMAPS_LAMA_TIMER_ORDERING]; rmaps_lama_display_indv_timer_core(diff, label); free(label); total += diff; /* * Timer: Total Overhead */ label = strdup("Other Overhead"); diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; rmaps_lama_display_indv_timer_core(diff - total, label); free(label); /* * Timer: Total */ label = strdup("Total"); diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; rmaps_lama_display_indv_timer_core(diff, label); free(label); opal_output(0, "mca:rmaps:lama: ---------------------------------"); } static void rmaps_lama_clear_timers(void) { int i; for(i = 0; i < RMAPS_LAMA_TIMER_MAX; ++i) { timer_start[i] = 0.0; timer_end[i] = 0.0; timer_accum[i] = 0.0; } } static void rmaps_lama_display_indv_timer_core(double diff, char *str) { double perc = 0; double total = 0; total = timer_end[RMAPS_LAMA_TIMER_TOTAL] - timer_start[RMAPS_LAMA_TIMER_TOTAL]; perc = (diff/total) * 100; opal_output(0, "mca:rmaps:lama: \t%-20s = %10.2f ms\t%6.2f %s\n", str, (diff * 1000), perc, "%"); return; }