1
1

Update to the latest version provided by Guillaume.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
George Bosilca 2017-06-13 22:15:09 -04:00
родитель fc21ffadc9
Коммит 2c00c4209a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 09C926752C9F09B1
39 изменённых файлов: 5196 добавлений и 2728 удалений

Просмотреть файл

@ -13,20 +13,25 @@
if topo_treematch_local
extra_treematch_files = treematch/tm_bucket.h \
treematch/tm_hwloc.h treematch/tm_mapping.h \
treematch/tm_mapping.h \
treematch/tm_timings.h treematch/tm_tree.h \
treematch/tm_kpartitioning.h treematch/uthash.h\
treematch/IntConstantInitializedVector.h \
treematch/tm_mt.h \
treematch/tm_mt.h treematch/fibo.h \
treematch/tm_thread_pool.h treematch/tm_verbose.h \
treematch/tm_malloc.h \
treematch/tm_malloc.h treematch/k-partitioning.h\
treematch/tm_solution.h treematch/tm_topology.h\
treematch/PriorityQueue.h \
treematch/IntConstantInitializedVector.c \
treematch/tm_mt.c \
treematch/tm_mt.c treematch/fibo.c \
treematch/tm_thread_pool.c treematch/tm_verbose.c \
treematch/tm_malloc.c \
treematch/tm_malloc.c treematch/treematch.h \
treematch/tm_mapping.c treematch/tm_timings.c \
treematch/tm_bucket.c treematch/tm_tree.c \
treematch/tm_hwloc.c treematch/tm_kpartitioning.c
treematch/tm_topology.c treematch/tm_kpartitioning.c \
treematch/tm_solution.c treematch/k-partitioning.c \
treematch/PriorityQueue.c
EXTRA_DIST = treematch/COPYING treematch/LICENSE
endif
sources = \

Просмотреть файл

@ -70,7 +70,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* module,
int n, const int nodes[],
const int degrees[], const int targets[],
const int weights[],
struct opal_info_t *info, int reorder,
struct ompi_info_t *info, int reorder,
ompi_communicator_t **newcomm);
/*
* ******************************************************************

Просмотреть файл

@ -62,6 +62,9 @@ mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component =
static int init_query(bool enable_progress_threads, bool enable_mpi_threads)
{
if(NULL == opal_hwloc_topology) {
return OPAL_ERR_NOT_SUPPORTED;
}
return OMPI_SUCCESS;
}
@ -95,3 +98,4 @@ static int mca_topo_treematch_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY, &mca_topo_treematch_component.reorder_mode);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -3,8 +3,8 @@
* Copyright (c) 2011-2017 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011-2015 INRIA. All rights reserved.
* Copyright (c) 2012-2015 Bordeaux Poytechnic Institute
* Copyright (c) 2011-2016 INRIA. All rights reserved.
* Copyright (c) 2012-2017 Bordeaux Poytechnic Institute
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -25,6 +25,7 @@
#include "opal/mca/hwloc/hwloc-internal.h"
#include "ompi/mca/topo/treematch/topo_treematch.h"
#include "ompi/mca/topo/treematch/treematch/treematch.h"
#include "ompi/mca/topo/treematch/treematch/tm_mapping.h"
#include "ompi/mca/topo/base/base.h"
@ -46,6 +47,7 @@
#define FALLBACK() \
do { free(nodes_roots); \
free(lindex_to_grank); \
if( NULL != set) hwloc_bitmap_free(set); \
goto fallback; } \
while(0);
@ -92,8 +94,8 @@ static void dump_int_array( char* prolog, char* line_prolog, int* array, size_t
size_t i;
fprintf(stdout,"%s : ", prolog);
for(i = 0; i < num_procs_in_node ; i++)
fprintf(stdout,"[$s%i:%i] ", line_prolog, i, array[i]);
for(i = 0; i < length ; i++)
fprintf(stdout,"%s [%lu:%i] ", line_prolog, i, array[i]);
fprintf(stdout,"\n");
}
static void dump_double_array( char* prolog, char* line_prolog, double* array, size_t length )
@ -101,8 +103,8 @@ static void dump_double_array( char* prolog, char* line_prolog, double* array, s
size_t i;
fprintf(stdout,"%s : ", prolog);
for(i = 0; i < num_procs_in_node ; i++)
fprintf(stdout,"%s [%i:%i] ", line_prolog, i, array[i]);
for(i = 0; i < length ; i++)
fprintf(stdout,"%s [%lu:%lf] ", line_prolog, i, array[i]);
fprintf(stdout,"\n");
}
#endif
@ -112,7 +114,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
int n, const int nodes[],
const int degrees[], const int targets[],
const int weights[],
struct opal_info_t *info, int reorder,
struct ompi_info_t *info, int reorder,
ompi_communicator_t **newcomm)
{
int err;
@ -155,6 +157,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
int num_nodes = 0;
int num_procs_in_node = 0;
int rank, size;
int *k = NULL;
int newrank = -1;
int hwloc_err;
int oversubscribing_objs = 0, oversubscribed_pus = 0;
int i, j, idx;
@ -250,6 +254,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
* all the calls that involve collective communications, so we have to lay the logic
* accordingly.
*/
if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
#ifdef __DEBUG__
if (0 == rank)
@ -291,6 +296,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
num_objs_in_node,num_procs_in_node,
nodes_roots,lindex_to_grank,comm_old);
}
if (!oversubscribed_pus) {
/* Update the data used to compute the correct binding */
if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
@ -306,17 +312,17 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
if( !oversubscribing_objs && !oversubscribed_pus ) {
if( hwloc_bitmap_isincluded(root_obj->cpuset,set) ) { /* processes are not bound on the machine */
obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
effective_depth = depth;
object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
if( NULL == object) FALLBACK();
obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
effective_depth = depth;
object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
if( NULL == object) FALLBACK();
hwloc_bitmap_copy(set,object->cpuset);
hwloc_bitmap_singlify(set); /* we don't want the process to move */
hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
if( -1 == hwloc_err) FALLBACK();
hwloc_bitmap_copy(set,object->cpuset);
hwloc_bitmap_singlify(set); /* we don't want the process to move */
hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
if( -1 == hwloc_err) FALLBACK();
#ifdef __DEBUG__
fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
#endif
} else {
#ifdef __DEBUG__
@ -385,7 +391,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
if (0 == mca_topo_treematch_component.reorder_mode) {
int *k = NULL;
int *obj_mapping = NULL;
int newrank = -1;
int num_objs_total = 0;
/* Gather comm pattern
@ -419,7 +424,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
tm_topology_t *tm_opt_topology = NULL;
int *obj_to_rank_in_comm = NULL;
int *hierarchies = NULL;
int hierarchy[MAX_LEVELS+1];
int hierarchy[TM_MAX_LEVELS+1];
int min;
/* create a table that derives the rank in comm_old from the object number */
@ -489,27 +494,27 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
free(obj_to_rank_in_comm);
hierarchy[0] = numlevels;
assert(numlevels < MAX_LEVELS);
assert(numlevels < TM_MAX_LEVELS);
for(i = 0 ; i < hierarchy[0]; i++)
hierarchy[i+1] = tracker[i]->arity;
for(; i < (MAX_LEVELS+1); i++) /* fill up everything else with -1 */
for(; i < (TM_MAX_LEVELS+1); i++) /* fill up everything else with -1 */
hierarchy[i] = -1;
if( 0 == rank ) {
hierarchies = (int *)malloc(num_nodes*(MAX_LEVELS+1)*sizeof(int));
memcpy(hierarchies, hierarchy, (MAX_LEVELS+1)*sizeof(int));
hierarchies = (int *)malloc(num_nodes*(TM_MAX_LEVELS+1)*sizeof(int));
memcpy(hierarchies, hierarchy, (TM_MAX_LEVELS+1)*sizeof(int));
}
/* gather hierarchies iff more than 1 node! */
if ( num_nodes > 1 ) {
if( rank != 0 ) {
if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(hierarchy,(MAX_LEVELS+1), MPI_INT, 0,
if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(hierarchy,(TM_MAX_LEVELS+1), MPI_INT, 0,
111, MCA_PML_BASE_SEND_STANDARD, comm_old))))
ERR_EXIT(err);
} else {
for(i = 1; i < num_nodes ; i++)
if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(hierarchies+i*(MAX_LEVELS+1), (MAX_LEVELS+1), MPI_INT,
if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(hierarchies+i*(TM_MAX_LEVELS+1), (TM_MAX_LEVELS+1), MPI_INT,
nodes_roots[i], 111, comm_old, &reqs[i-1])))){
free(hierarchies);
ERR_EXIT(err);
@ -524,23 +529,25 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
}
if ( 0 == rank ) {
tree_t *comm_tree = NULL;
tm_tree_t *comm_tree = NULL;
tm_solution_t *sol = NULL;
tm_affinity_mat_t *aff_mat = NULL;
double **comm_pattern = NULL;
int *matching = NULL;
#ifdef __DEBUG__
dump_int_array("hierarchies : ", "", hierarchies, num_nodes*(MAX_LEVELS+1));
dump_int_array("hierarchies : ", "", hierarchies, num_nodes*(TM_MAX_LEVELS+1));
#endif
tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
tm_topology->nb_levels = hierarchies[0];
/* extract min depth */
for(i = 1 ; i < num_nodes ; i++)
if (hierarchies[i*(MAX_LEVELS+1)] < tm_topology->nb_levels)
tm_topology->nb_levels = hierarchies[i*(MAX_LEVELS+1)];
if (hierarchies[i*(TM_MAX_LEVELS+1)] < tm_topology->nb_levels)
tm_topology->nb_levels = hierarchies[i*(TM_MAX_LEVELS+1)];
/* Crush levels in hierarchies too long (ie > tm_topology->nb_levels)*/
for(i = 0; i < num_nodes ; i++) {
int *base_ptr = hierarchies + i*(MAX_LEVELS+1);
int *base_ptr = hierarchies + i*(TM_MAX_LEVELS+1);
int suppl = *base_ptr - tm_topology->nb_levels;
for(j = 1 ; j <= suppl ; j++)
*(base_ptr + tm_topology->nb_levels) *= *(base_ptr + tm_topology->nb_levels + j);
@ -553,8 +560,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
for(i = 1; i < tm_topology->nb_levels; i++) { /* compute the minimum for each level */
min = hierarchies[i];
for(j = 1; j < num_nodes ; j++)
if( hierarchies[j*(MAX_LEVELS+1) + i] < min)
min = hierarchies[j*(MAX_LEVELS+1) + i];
if( hierarchies[j*(TM_MAX_LEVELS+1) + i] < min)
min = hierarchies[j*(TM_MAX_LEVELS+1) + i];
tm_topology->arity[i] = min;
}
} else {
@ -568,24 +575,58 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
fprintf(stdout,"topo_arity[%i] = %i\n", i, tm_topology->arity[i]);
#endif
/* compute the number of processing elements */
tm_topology->nb_nodes = (int *)calloc(tm_topology->nb_levels, sizeof(int));
tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
tm_topology->nb_nodes[0] = 1;
for(i = 1 ; i < tm_topology->nb_levels; i++)
tm_topology->nb_nodes[i] = tm_topology->nb_nodes[i-1] * tm_topology->arity[i-1];
/* Build process id tab */
tm_topology->node_id = (int **)calloc(tm_topology->nb_levels, sizeof(int*));
tm_topology->node_rank = (int **)malloc(sizeof(int *) * tm_topology->nb_levels);
for(i = 0; i < tm_topology->nb_levels; i++) {
tm_topology->node_id[i] = (int *)calloc(tm_topology->nb_nodes[i], sizeof(int));
for (j = 0; j < tm_topology->nb_nodes[i]; j++)
tm_topology->node_id[i][j] = obj_mapping[j];
tm_topology->node_rank[i] = (int * )calloc(tm_topology->nb_nodes[i], sizeof(int));
/*note : we make the hypothesis that logical indexes in hwloc range from
0 to N, are contiguous and crescent. */
for( j = 0 ; j < tm_topology->nb_nodes[i] ; j++ ){
tm_topology->node_id[i][j] = j;
tm_topology->node_rank[i][j] = j;
/* Should use object->logical_index */
/* obj = hwloc_get_obj_by_depth(topo,i,j%num_objs_in_node);
id = obj->logical_index + (num_objs_in_node)*(j/num_obj_in_node)*/
/*
int id = core_numbering[j%nb_core_per_nodes] + (nb_core_per_nodes)*(j/nb_core_per_nodes);
topology->node_id[i][j] = id;
 topology->node_rank[i][id] = j;
*/
}
}
/* unused for now*/
tm_topology->cost = (double*)calloc(tm_topology->nb_levels,sizeof(double));
tm_topology->nb_proc_units = num_objs_total;
tm_topology->nb_constraints = 0;
for(i = 0; i < tm_topology->nb_proc_units ; i++)
if (obj_mapping[i] != -1)
tm_topology->nb_constraints++;
tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints,sizeof(int));
for(idx = 0,i = 0; i < tm_topology->nb_proc_units ; i++)
if (obj_mapping[i] != -1)
tm_topology->constraints[idx++] = obj_mapping[i];
tm_topology->oversub_fact = 1;
#ifdef __DEBUG__
assert(num_objs_total == tm_topology->nb_nodes[tm_topology->nb_levels-1]);
for(i = 0; i < tm_topology->nb_levels ; i++) {
fprintf(stdout,"tm topo node_id for level [%i] : ",i);
dump_int_array("", "", obj_mapping, tm_topology->nb_nodes[i]);
}
display_topology(tm_topology);
tm_display_topology(tm_topology);
#endif
comm_pattern = (double **)malloc(size*sizeof(double *));
@ -600,32 +641,31 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
#ifdef __DEBUG__
fprintf(stdout,"==== COMM PATTERN ====\n");
for( i = 0 ; i < size ; i++) {
dump_double_array("", "", comm_pattern, size);
dump_double_array("", "", comm_pattern[i], size);
}
#endif
k = (int *)calloc(num_objs_total, sizeof(int));
matching = (int *)calloc(size, sizeof(int));
tm_optimize_topology(&tm_topology);
aff_mat = tm_build_affinity_mat(comm_pattern,size);
comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
sol = tm_compute_mapping(tm_topology, comm_tree);
k = (int *)calloc(sol->k_length, sizeof(int));
for(idx = 0 ; idx < sol->k_length ; idx++)
k[idx] = sol->k[idx][0];
tm_opt_topology = optimize_topology(tm_topology);
comm_tree = build_tree_from_topology(tm_opt_topology, comm_pattern, size, NULL, NULL);
map_topology_simple(tm_opt_topology, comm_tree, matching, size, k);
#ifdef __DEBUG__
fprintf(stdout,"====> nb levels : %i\n",tm_topology->nb_levels);
dump_int_array("Rank permutation sigma/k : ", "", k, num_objs_total);
dump_int_array("Matching : ", "", matching, size);
assert(size == sol->sigma_length);
dump_int_array("Matching : ", "",sol->sigma, sol->sigma_length);
#endif
free(comm_pattern);
free(comm_tree);
free(matching);
free(obj_mapping);
for(i = 0 ; i < tm_topology->nb_levels ; i++)
free(tm_topology->node_id[i]);
free(tm_topology->node_id);
free(tm_topology->nb_nodes);
free(tm_topology->arity);
free(tm_topology);
FREE_topology(tm_opt_topology);
free(comm_pattern);
free(aff_mat->sum_row);
free(aff_mat);
tm_free_solution(sol);
tm_free_tree(comm_tree);
tm_free_topology(tm_topology);
}
}
@ -648,15 +688,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
(*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
(*newcomm)->c_topo = topo_module;
(*newcomm)->c_topo->reorder = reorder;
} else { /* partially distributed reordering */
ompi_communicator_t *localcomm = NULL;
int *matching = (int *)calloc(num_procs_in_node,sizeof(int));
int *lrank_to_grank = (int *)calloc(num_procs_in_node,sizeof(int));
int *grank_to_lrank = (int *)calloc(size,sizeof(int));
hwloc_obj_t object;
opal_hwloc_locality_t locality;
char set_as_string[64];
opal_value_t kv;
if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, colors[rank], rank,
&localcomm, false)))
@ -696,8 +733,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
/* The root has now the entire information, so let's crunch it */
if (rank == lindex_to_grank[0]) {
tm_topology_t *tm_topology = NULL;
tm_topology_t *tm_opt_topology = NULL;
tree_t *comm_tree = NULL;
tm_tree_t *comm_tree = NULL;
tm_solution_t *sol = NULL;
tm_affinity_mat_t *aff_mat = NULL;
double **comm_pattern = NULL;
comm_pattern = (double **)malloc(num_procs_in_node*sizeof(double *));
@ -717,7 +755,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
fprintf(stdout,"========== COMM PATTERN ============= \n");
for(i = 0 ; i < num_procs_in_node ; i++){
fprintf(stdout," %i : ",i);
dump_double_array("", "", comm_pattern, num_procs_in_node);
dump_double_array("", "", comm_pattern[i], num_procs_in_node);
}
fprintf(stdout,"======================= \n");
#endif
@ -725,92 +763,92 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
tm_topology->nb_levels = numlevels;
tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
tm_topology->nb_nodes = (int *)calloc(tm_topology->nb_levels, sizeof(int));
tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
tm_topology->node_id = (int **)malloc(tm_topology->nb_levels*sizeof(int *));
tm_topology->node_rank = (int **)malloc(tm_topology->nb_levels*sizeof(int *));
for(i = 0 ; i < tm_topology->nb_levels ; i++){
int nb_objs = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, tracker[i]->depth);
tm_topology->nb_nodes[i] = nb_objs;
tm_topology->arity[i] = tracker[i]->arity;
tm_topology->node_id[i] = (int*)malloc(sizeof(int)*nb_objs);
for(j = 0; j < num_procs_in_node; j++)
tm_topology->node_id[i][j] = localrank_to_objnum[j];
for(; j < nb_objs; tm_topology->node_id[i][j] = -1, j++); /* complete with empty */
tm_topology->node_id[i] = (int *)calloc(tm_topology->nb_nodes[i], sizeof(int));
tm_topology->node_rank[i] = (int * )calloc(tm_topology->nb_nodes[i], sizeof(int));
for(j = 0; j < tm_topology->nb_nodes[i] ; j++){
tm_topology->node_id[i][j] = j;
tm_topology->node_rank[i][j] = j;
}
}
/* unused for now*/
tm_topology->cost = (double*)calloc(tm_topology->nb_levels,sizeof(double));
tm_topology->nb_proc_units = num_objs_in_node;
//tm_topology->nb_proc_units = num_procs_in_node;
tm_topology->nb_constraints = 0;
for(i = 0; i < num_procs_in_node ; i++)
if (localrank_to_objnum[i] != -1)
tm_topology->nb_constraints++;
tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints,sizeof(int));
for(idx = 0,i = 0; i < num_procs_in_node ; i++)
if (localrank_to_objnum[i] != -1)
tm_topology->constraints[idx++] = localrank_to_objnum[i];
tm_topology->oversub_fact = 1;
#ifdef __DEBUG__
assert(num_objs_in_node == tm_topology->nb_nodes[tm_topology->nb_levels-1]);
fprintf(stdout,"Levels in topo : %i | num procs in node : %i\n",tm_topology->nb_levels,num_procs_in_node);
for(i = 0; i < tm_topology->nb_levels ; i++){
fprintf(stdout,"Nb objs for level %i : %i | arity %i\n ",i,tm_topology->nb_nodes[i],tm_topology->arity[i]);
dump_int_array("", "Obj id ", tm_topology->node_id[i], tm_topology->nb_nodes[i]);
}
display_topology(tm_topology);
tm_display_topology(tm_topology);
#endif
tm_optimize_topology(&tm_topology);
aff_mat = tm_build_affinity_mat(comm_pattern,num_procs_in_node);
comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
sol = tm_compute_mapping(tm_topology, comm_tree);
tm_opt_topology = optimize_topology(tm_topology);
comm_tree = build_tree_from_topology(tm_opt_topology, comm_pattern, num_procs_in_node, NULL, NULL);
map_topology_simple(tm_opt_topology, comm_tree, matching, num_procs_in_node, NULL);
k = (int *)calloc(sol->k_length, sizeof(int));
for(idx = 0 ; idx < sol->k_length ; idx++)
k[idx] = sol->k[idx][0];
#ifdef __DEBUG__
dump_int_array("Matching:", "", matching, num_procs_in_node);
fprintf(stdout,"====> nb levels : %i\n",tm_topology->nb_levels);
dump_int_array("Rank permutation sigma/k : ", "", k, num_procs_in_node);
assert(num_procs_in_node == sol->sigma_length);
dump_int_array("Matching : ", "",sol->sigma, sol->sigma_length);
#endif
free(aff_mat->sum_row);
free(aff_mat);
free(comm_pattern);
for(i = 0; i < tm_topology->nb_levels; i++)
free(tm_topology->node_id[i]);
free(tm_topology->node_id);
free(tm_topology->nb_nodes);
free(tm_topology->arity);
free(tm_topology);
FREE_topology(tm_opt_topology);
tm_free_solution(sol);
tm_free_tree(comm_tree);
tm_free_topology(tm_topology);
}
/* Todo : Bcast + group creation */
/* scatter the ranks */
if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_bcast(matching, num_procs_in_node,
MPI_INT,0,localcomm,
localcomm->c_coll->coll_bcast_module)))
MPI_INT,0,localcomm,
localcomm->c_coll->coll_bcast_module)))
ERR_EXIT(err);
object = hwloc_get_obj_by_depth(opal_hwloc_topology,
effective_depth, matching[ompi_process_info.my_local_rank]);
if( NULL == object) goto fallback;
hwloc_bitmap_copy(set, object->cpuset);
hwloc_bitmap_singlify(set);
err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
if( -1 == err) goto fallback;
if ( 0 == rank )
free(k);
/* Report new binding to ORTE/OPAL */
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
err = hwloc_bitmap_snprintf(set_as_string, 64, set);
#ifdef __DEBUG__
fprintf(stdout,"Bitmap str size : %i\n", err);
#endif
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_CPUSET);
kv.type = OPAL_STRING;
kv.data.string = strdup(set_as_string);
(void)opal_pmix.store_local((opal_process_name_t*)OMPI_PROC_MY_NAME, &kv);
OBJ_DESTRUCT(&kv);
locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
ompi_process_info.cpuset,set_as_string);
OBJ_CONSTRUCT(&kv, opal_value_t);
kv.key = strdup(OPAL_PMIX_LOCALITY);
kv.type = OPAL_UINT16;
kv.data.uint16 = locality;
(void)opal_pmix.store_local((opal_process_name_t*)OMPI_PROC_MY_NAME, &kv);
OBJ_DESTRUCT(&kv);
if( OMPI_SUCCESS != (err = ompi_comm_create(comm_old,
comm_old->c_local_group,
newcomm))) {
/* this needs to be optimized but will do for now */
if (OMPI_SUCCESS != (err = ompi_comm_split(localcomm, 0, newrank, newcomm, false)))
ERR_EXIT(err);
} else {
/* Attach the dist_graph to the newly created communicator */
(*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
(*newcomm)->c_topo = topo_module;
(*newcomm)->c_topo->reorder = reorder;
}
/* end of TODO */
/* Attach the dist_graph to the newly created communicator */
(*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
(*newcomm)->c_topo = topo_module;
(*newcomm)->c_topo->reorder = reorder;
free(matching);
free(grank_to_lrank);
free(lrank_to_grank);

Просмотреть файл

@ -2,13 +2,12 @@
#include <stdio.h>
#include "IntConstantInitializedVector.h"
int intCIV_isInitialized(int_CIVector * v, int i)
{
if(v->top == 0)
return 0;
if(v->from[i] >= 0)
if(v->from[i] < v->top && v->to[v->from[i]] == i)
if(v->from[i] < v->top && v->to[v->from[i]] == i)
return 1;
return 0;
}
@ -45,7 +44,7 @@ int intCIV_set(int_CIVector * v, int i, int val)
v->top++;
}
v->vec[i] = val;
return 0;
return 0;
}
int intCIV_get(int_CIVector * v, int i)

Просмотреть файл

@ -12,5 +12,4 @@ void intCIV_exit(int_CIVector * v);
int intCIV_set(int_CIVector * v, int i, int val);
int intCIV_get(int_CIVector * v, int i);
#endif /*INTEGER_CONSTANT_INITIALIZED_VECTOR*/

Просмотреть файл

@ -0,0 +1,174 @@
#include <stdlib.h>
#include "PriorityQueue.h"
/*
This comparison function is used to sort elements in key descending order.
*/
int compfunc(const FiboNode * const, const FiboNode * const);
int compFunc(const FiboNode * const node1, const FiboNode * const node2)
{
return
( ( ((QueueElement*)(node1))->key > ((QueueElement*)(node2))->key ) ? -1 : 1);
}
/* Initialize a PriorityQueue of capacity `size` (values must lie in [0, size)).
 * The PriorityQueue struct itself must already be allocated by the caller.
 * Returns 0 on success, non-zero on failure (allocation or tree init). */
int PQ_init(PriorityQueue * const q, int size)
{
    q->size = size;
    /* calloc zero-fills (every slot starts NULL) and checks the
     * count * size multiplication for overflow; the original malloc
     * result was used unchecked. */
    q->elements = calloc((size_t)size, sizeof(QueueElement *));
    if (q->elements == NULL)
        return 1; /* contract: non-zero on failure */
    return fiboTreeInit((FiboTree *)q, compFunc);
}
/* Destroy the queue: free every stored element, the element vector, and the
 * underlying tree's resources. The queue is unusable until PQ_init again.
 * The PriorityQueue struct itself is not freed. */
void PQ_exit(PriorityQueue * const q)
{
    int i;
    /* Guard the vector itself: if PQ_init failed, q->elements may be NULL
     * and the original loop would have dereferenced it. */
    if (q->elements != NULL)
    {
        for (i = 0; i < q->size; i++)
            free(q->elements[i]); /* free(NULL) is a no-op, no per-slot guard needed */
        free(q->elements);
        q->elements = NULL; /* defend against double PQ_exit */
    }
    fiboTreeExit((FiboTree *)q);
}
/* Empty the queue, freeing the stored elements, but keep the queue usable
 * (the element vector and tree structure are retained). */
void PQ_free(PriorityQueue * const q)
{
    int i;
    if (q->elements != NULL) /* tolerate a queue whose init failed */
    {
        for (i = 0; i < q->size; i++)
        {
            free(q->elements[i]);
            /* Fix: the original left freed pointers in the vector; a later
             * PQ_delete/PQ_adjustKey on the same value would have been a
             * use-after-free / double free. */
            q->elements[i] = NULL;
        }
    }
    fiboTreeFree((FiboTree *)q);
}
/* Return 1 when the queue holds no element, 0 otherwise. */
int PQ_isEmpty(PriorityQueue * const q)
{
    FiboTree *tree = (FiboTree *)q;
    /* An empty Fibonacci tree is represented by the root node being
     * linked back onto itself. */
    return (tree->rootdat.linkdat.nextptr == &tree->rootdat) ? 1 : 0;
}
/* Insert a caller-owned QueueElement into the queue.
 * Elements whose value cannot index the vector are silently ignored. */
void PQ_insertElement(PriorityQueue * const q, QueueElement * const e)
{
    if (e->value < 0 || e->value >= q->size)
        return;
    fiboTreeAdd((FiboTree *)q, (FiboNode *)e);
    q->elements[e->value] = e;
    e->isInQueue = 1;
}
/* Remove the given element from the queue; the element itself is not freed. */
void PQ_deleteElement(PriorityQueue * const q, QueueElement * const e)
{
    /* Unlink from the tree, then clear the per-value bookkeeping. */
    fiboTreeDel((FiboTree *)q, (FiboNode *)e);
    e->isInQueue = 0;
    q->elements[e->value] = NULL;
}
/* Allocate and insert a new element with the given value and key.
 * Out-of-range values are silently ignored (matching PQ_insertElement). */
void PQ_insert(PriorityQueue * const q, int val, double key)
{
    QueueElement *e;

    if (val < 0 || val >= q->size)
        return;
    e = malloc(sizeof *e);
    if (e == NULL)
        return; /* fix: the original used the malloc result unchecked */
    e->value = val;
    e->key = key;
    PQ_insertElement(q, e); /* sets isInQueue and the vector slot */
}
/* Remove and free the element stored under `val`, if any. */
void PQ_delete(PriorityQueue * const q, int val)
{
    QueueElement *e;

    /* Fix: the original neither bounds-checked val nor handled an absent
     * value; PQ_deleteElement then dereferenced a NULL element. */
    if (val < 0 || val >= q->size)
        return;
    e = q->elements[val];
    if (e == NULL)
        return;
    PQ_deleteElement(q, e);
    free(e);
}
/* Return the element with the greatest key, or NULL if the queue is empty. */
QueueElement * PQ_findMaxElement(PriorityQueue * const q)
{
    /* compFunc orders keys descending, so the tree "minimum" is the max key. */
    return (QueueElement *)fiboTreeMin((FiboTree *)q);
}
/* Pop the element with the greatest key (ownership passes to the caller).
 * Returns NULL on an empty queue. */
QueueElement * PQ_deleteMaxElement(PriorityQueue * const q)
{
    QueueElement *top = (QueueElement *)fiboTreeMin((FiboTree *)q);
    if (top == NULL)
        return NULL;
    PQ_deleteElement(q, top);
    return top;
}
/* Return the greatest key in the queue; an empty queue reports 0. */
double PQ_findMaxKey(PriorityQueue * const q)
{
    const QueueElement *top = PQ_findMaxElement(q);
    return (top == NULL) ? 0 : top->key;
}
/* Pop the element with the greatest key, free it, and return its value.
 * Returns -1 when the queue is empty. */
int PQ_deleteMax(PriorityQueue * const q)
{
    int res = -1;
    QueueElement *top = PQ_deleteMaxElement(q);
    if (top != NULL)
    {
        res = top->value;
        free(top);
    }
    return res;
}
/* Add i to the key of e. Re-keying is done by delete + re-insert so the
 * heap property is restored; elements not in the queue are left alone. */
void PQ_increaseElementKey(PriorityQueue * const q, QueueElement * const e, double i)
{
    if (!e->isInQueue)
        return;
    PQ_deleteElement(q, e);
    e->key += i;
    PQ_insertElement(q, e);
}
/* Subtract i from the key of e (delete + re-insert keeps the heap valid). */
void PQ_decreaseElementKey(PriorityQueue * const q, QueueElement * const e, double i)
{
    if (!e->isInQueue)
        return;
    PQ_deleteElement(q, e);
    e->key -= i;
    PQ_insertElement(q, e);
}
/* Set the key of e to i (delete + re-insert keeps the heap valid). */
void PQ_adjustElementKey(PriorityQueue * const q, QueueElement * const e, double i)
{
    if (!e->isInQueue)
        return;
    PQ_deleteElement(q, e);
    e->key = i;
    PQ_insertElement(q, e);
}
/* Add i to the key of the element stored under `val`, if present. */
void PQ_increaseKey(PriorityQueue * const q, int val, double i)
{
    /* Fix: guard val before indexing — the original read q->elements[val]
     * out of bounds for values outside [0, size). */
    if (val < 0 || val >= q->size)
        return;
    if (q->elements[val] != NULL)
        PQ_increaseElementKey(q, q->elements[val], i);
}
/* Subtract i from the key of the element stored under `val`, if present. */
void PQ_decreaseKey(PriorityQueue * const q, int val, double i)
{
    /* Fix: guard val before indexing — the original read q->elements[val]
     * out of bounds for values outside [0, size). */
    if (val < 0 || val >= q->size)
        return;
    if (q->elements[val] != NULL)
        PQ_decreaseElementKey(q, q->elements[val], i);
}
/* Set to i the key of the element stored under `val`, if present. */
void PQ_adjustKey(PriorityQueue * const q, int val, double i)
{
    /* Fix: guard val before indexing — the original read q->elements[val]
     * out of bounds for values outside [0, size). */
    if (val < 0 || val >= q->size)
        return;
    if (q->elements[val] != NULL)
        PQ_adjustElementKey(q, q->elements[val], i);
}

Просмотреть файл

@ -0,0 +1,108 @@
#ifndef PRIORITY_QUEUE
#define PRIORITY_QUEUE
#include "fibo.h"
/*
  Element stored in a PriorityQueue.
  The FiboNode is the FIRST member, so a QueueElement pointer and a FiboNode
  pointer can be converted into one another with a simple cast.
*/
typedef struct QueueElement_
{
FiboNode node; /* node used to link the element into the FiboTree */
double key; /* sort key; elements are ordered by descending key */
int value;
int isInQueue;
} QueueElement;
typedef struct PriorityQueue_
{
FiboTree tree;
QueueElement ** elements; /* vector indexed by element value, so an element can be retrieved from its value in O(1) */
int size; /* capacity allocated to the elements vector */
} PriorityQueue;
/*
  PQ_init initializes a PriorityQueue with the size given in argument and sets
  compFunc as comparison function. Note that the PriorityQueue pointer itself
  must already be allocated before calling this function.
  Returns :
  0 if success
  !0 if failed
  PQ_free simply empties the PriorityQueue but does not free the memory used by its elements.
  PQ_exit destroys the PriorityQueue without freeing elements. The PriorityQueue is no longer usable until PQ_init is called again.
  Note that the PriorityQueue pointer is not deallocated.
*/
int PQ_init(PriorityQueue * const, int size);
void PQ_free(PriorityQueue * const);
void PQ_exit(PriorityQueue * const);
/*
  PQ_isEmpty returns 1 if the PriorityQueue is empty, 0 otherwise.
*/
int PQ_isEmpty(PriorityQueue * const);
/*
  PQ_insertElement inserts the given QueueElement into the given PriorityQueue.
*/
void PQ_insertElement(PriorityQueue * const, QueueElement * const);
/*
  PQ_deleteElement deletes the element given in argument from the PriorityQueue.
*/
void PQ_deleteElement(PriorityQueue * const, QueueElement * const);
/*
  PQ_insert inserts an element in the PriorityQueue with the value and key given in argument.
*/
void PQ_insert(PriorityQueue * const, int val, double key);
/*
  PQ_delete removes the first element found with the value given in argument and frees it.
*/
void PQ_delete(PriorityQueue * const, int val);
/*
  PQ_findMaxElement returns the QueueElement with the greatest key in the given PriorityQueue.
*/
QueueElement * PQ_findMaxElement(PriorityQueue * const);
/*
  PQ_deleteMaxElement returns the QueueElement with the greatest key in the given PriorityQueue and removes it from the queue.
*/
QueueElement * PQ_deleteMaxElement(PriorityQueue * const);
/*
  PQ_findMaxKey returns the key of the element with the greatest key in the given PriorityQueue.
*/
double PQ_findMaxKey(PriorityQueue * const);
/*
  PQ_deleteMax returns the value of the element with the greatest key in the given PriorityQueue and removes it from the queue.
*/
int PQ_deleteMax(PriorityQueue * const);
/*
  PQ_increaseElementKey adds the value of i to the key of the given QueueElement.
*/
void PQ_increaseElementKey(PriorityQueue * const, QueueElement * const, double i);
/*
  PQ_decreaseElementKey subtracts the value of i from the key of the given QueueElement.
*/
void PQ_decreaseElementKey(PriorityQueue * const, QueueElement * const, double i);
/*
  PQ_adjustElementKey sets to i the key of the given QueueElement.
*/
void PQ_adjustElementKey(PriorityQueue * const, QueueElement * const, double i);
/*
  PQ_increaseKey adds i to the key of the first element found with a value equal to val in the PriorityQueue.
*/
void PQ_increaseKey(PriorityQueue * const, int val, double i);
/*
  PQ_decreaseKey subtracts i from the key of the first element found with a value equal to val in the PriorityQueue.
*/
void PQ_decreaseKey(PriorityQueue * const, int val, double i);
/*
  PQ_adjustKey sets to i the key of the first element found with a value equal to val in the PriorityQueue.
*/
void PQ_adjustKey(PriorityQueue * const, int val, double i);
#endif /*PRIORITY_QUEUE*/

372
ompi/mca/topo/treematch/treematch/fibo.c Обычный файл
Просмотреть файл

@ -0,0 +1,372 @@
/* Copyright 2010 IPB, INRIA & CNRS
**
** This file originally comes from the Scotch software package for
** static mapping, graph partitioning and sparse matrix ordering.
**
** This software is governed by the CeCILL-B license under French law
** and abiding by the rules of distribution of free software. You can
** use, modify and/or redistribute the software under the terms of the
** CeCILL-B license as circulated by CEA, CNRS and INRIA at the following
** URL: "http://www.cecill.info".
**
** As a counterpart to the access to the source code and rights to copy,
** modify and redistribute granted by the license, users are provided
** only with a limited warranty and the software's author, the holder of
** the economic rights, and the successive licensors have only limited
** liability.
**
** In this respect, the user's attention is drawn to the risks associated
** with loading, using, modifying and/or developing or reproducing the
** software by the user in light of its specific status of free software,
** that may mean that it is complicated to manipulate, and that also
** therefore means that it is reserved for developers and experienced
** professionals having in-depth computer knowledge. Users are therefore
** encouraged to load and test the software's suitability as regards
** their requirements in conditions enabling the security of their
** systems and/or data to be ensured and, more generally, to use and
** operate it in the same conditions as regards security.
**
** The fact that you are presently reading this means that you have had
** knowledge of the CeCILL-B license and that you accept its terms.
*/
/************************************************************/
/** **/
/** NAME : fibo.c **/
/** **/
/** AUTHOR : Francois PELLEGRINI **/
/** **/
/** FUNCTION : This module handles Fibonacci trees. **/
/** **/
/** DATES : # Version 1.0 : from : 01 may 2010 **/
/** to 12 may 2010 **/
/** **/
/************************************************************/
/*
** The defines and includes.
*/
#define FIBO
#include <stdlib.h>
#include <memory.h>
#include <stdio.h>
#include "fibo.h"
/* Helper macros which can be redefined at compile time. */
#ifndef INT
#define INT int /* "long long" can be used on 64-bit systems */
#endif /* INT */
#ifndef errorPrint
#define errorPrint(s) fprintf (stderr, s)
#endif /* errorPrint */
#ifndef memAlloc
#define memAlloc malloc
#define memSet memset
#define memFree free
#endif /* memAlloc */
/*********************************************/
/* */
/* These routines deal with Fibonacci trees. */
/* */
/*********************************************/
/* This routine initializes a Fibonacci
** tree structure.
** It returns:
** - 0 : in case of success.
** - !0 : on error.
*/
int
fiboTreeInit (
FiboTree * const treeptr,
int (* cmpfptr) (const FiboNode * const, const FiboNode * const))
{
  size_t tablsiz;

  /* The consolidation array needs one cell per possible degree,
  ** i.e. as many cells as there are bits in an INT. */
  tablsiz = (sizeof (INT) << 3) * sizeof (FiboNode *);
  treeptr->degrtab = (FiboNode **) memAlloc (tablsiz);
  if (treeptr->degrtab == NULL)
    return (1);
  memSet (treeptr->degrtab, 0, tablsiz);    /* All cells NULL: ready for consolidation */

  treeptr->rootdat.linkdat.nextptr = &treeptr->rootdat; /* Root list starts as a self-loop on the dummy root */
  treeptr->rootdat.linkdat.prevptr = &treeptr->rootdat;
  treeptr->cmpfptr = cmpfptr;               /* Keep user comparison routine */

  return (0);
}
/* This routine flushes the contents of
** the given Fibonacci tree.
** It returns:
** - VOID : in all cases.
*/
void
fiboTreeExit (
FiboTree * const treeptr)
{
  /* Release the consolidation array if it was ever allocated. */
  if (treeptr->degrtab == NULL)
    return;
  memFree (treeptr->degrtab);
}
/* This routine flushes the contents of
** the given Fibonacci tree. It does not
** free any of its contents, but instead
** makes the tree structure look empty again.
** It returns:
** - VOID : in all cases.
*/
void
fiboTreeFree (
FiboTree * const treeptr)
{
  FiboNode * rootptr;

  /* Re-close the root list on the dummy root: the tree now looks empty.
  ** Node contents themselves are deliberately not freed. */
  rootptr = &treeptr->rootdat;
  rootptr->linkdat.nextptr = rootptr;
  rootptr->linkdat.prevptr = rootptr;
}
/* This routine performs the consolidation
** of roots per degree: roots of equal degree
** are merged until all remaining roots have
** distinct degrees. It returns the best
** element found because this element is not
** recorded in the data structure itself.
** It returns:
** - !NULL : pointer to best element found.
** - NULL : Fibonacci tree is empty.
*/
FiboNode *
fiboTreeConsolidate (
FiboTree * const treeptr)
{
FiboNode ** restrict degrtab;
int degrmax;
int degrval;
FiboNode * rootptr;
FiboNode * nextptr;
FiboNode * bestptr;
degrtab = treeptr->degrtab;
/* Pass 1: walk the root list, linking together roots of equal degree. */
for (rootptr = treeptr->rootdat.linkdat.nextptr, nextptr = rootptr->linkdat.nextptr, degrmax = 0; /* For all roots in root list */
rootptr != &treeptr->rootdat; ) {
degrval = rootptr->deflval >> 1; /* Get degree, getting rid of flag part */
#ifdef FIBO_DEBUG
if (degrval >= (sizeof (INT) << 3))
errorPrint ("fiboTreeConsolidate: invalid node degree");
#endif /* FIBO_DEBUG */
if (degrtab[degrval] == NULL) { /* If no tree with same degree already found */
if (degrval > degrmax) /* Record highest degree found */
degrmax = degrval;
degrtab[degrval] = rootptr; /* Record tree as first tree with this degree */
rootptr = nextptr; /* Process next root in list during next iteration */
nextptr = rootptr->linkdat.nextptr;
}
else {
FiboNode * oldrptr; /* Root which will no longer be a root */
FiboNode * chldptr;
oldrptr = degrtab[degrval]; /* Assume old root is worse */
if (treeptr->cmpfptr (oldrptr, rootptr) <= 0) { /* If old root is still better */
oldrptr = rootptr; /* This root will be linked to it */
rootptr = degrtab[degrval]; /* We will go on processing this root */
}
degrtab[degrval] = NULL; /* Remaining root changes degree so leaves this cell */
fiboTreeUnlink (oldrptr); /* Old root is no longer a root */
oldrptr->deflval &= ~1; /* Whatever old root flag was, it is reset to 0 */
oldrptr->pareptr = rootptr; /* Remaining root is now father of old root */
chldptr = rootptr->chldptr; /* Get first child of remaining root */
if (chldptr != NULL) { /* If remaining root had already some children, link old root with them */
rootptr->deflval += 2; /* Increase degree by 1, that is, by 2 with left shift in deflval */
fiboTreeLinkAfter (chldptr, oldrptr);
}
else { /* Old root becomes first child of remaining root */
rootptr->deflval = 2; /* Real degree set to 1, and flag set to 0 */
rootptr->chldptr = oldrptr;
oldrptr->linkdat.prevptr = /* Chain old root to oneself as only child */
oldrptr->linkdat.nextptr = oldrptr;
}
} /* Process again remaining root as its degree has changed */
}
/* Pass 2: scan the degree array for the best remaining root,
** resetting every used cell back to NULL for the next call. */
bestptr = NULL;
for (degrval = 0; degrval <= degrmax; degrval ++) {
if (degrtab[degrval] != NULL) { /* If some tree is found */
bestptr = degrtab[degrval]; /* Record it as potential best */
degrtab[degrval] = NULL; /* Clean-up used part of array */
degrval ++; /* Go on at next cell in next loop */
break;
}
}
for ( ; degrval <= degrmax; degrval ++) { /* For remaining roots once a potential best root has been found */
if (degrtab[degrval] != NULL) {
if (treeptr->cmpfptr (degrtab[degrval], bestptr) < 0) /* If new root is better */
bestptr = degrtab[degrval]; /* Record new root as best root */
degrtab[degrval] = NULL; /* Clean-up used part of array */
}
}
return (bestptr);
}
/* This routine returns the node of minimum
** key in the given tree. The node is searched
** for each time this routine is called, so this
** information should be recorded if needed.
** This is the non-macro version, for testing
** and setting up breakpoints.
** (fiboTreeMinMacro expands to fiboTreeConsolidate,
** see fibo.h, so calling this also consolidates
** the root list as a side effect.)
** It returns:
** - !NULL : pointer to best element found.
** - NULL : Fibonacci tree is empty.
*/
#ifndef fiboTreeMin
FiboNode *
fiboTreeMin (
FiboTree * const treeptr)
{
FiboNode * bestptr;
bestptr = fiboTreeMinMacro (treeptr);
#ifdef FIBO_DEBUG
fiboTreeCheck (treeptr);
#endif /* FIBO_DEBUG */
return (bestptr);
}
#endif /* fiboTreeMin */
/* This routine adds the given node to the
** given tree. This is the non-macro version,
** for testing and setting up breakpoints.
** (Note: fibo.h defines fiboTreeAdd as
** fiboTreeAddMacro by default, so this function
** body is normally compiled out.)
** It returns:
** - void : in all cases.
*/
#ifndef fiboTreeAdd
void
fiboTreeAdd (
FiboTree * const treeptr,
FiboNode * const nodeptr)
{
fiboTreeAddMacro (treeptr, nodeptr);
#ifdef FIBO_DEBUG
fiboTreeCheck (treeptr);
#endif /* FIBO_DEBUG */
}
#endif /* fiboTreeAdd */
/* This routine deletes the given node from
** the given tree, whatever this node is (root
** or non root). This is the non-macro version,
** for testing and setting up breakpoints.
** It returns:
** - void : in all cases.
*/
#ifndef fiboTreeDel
void
fiboTreeDel (
FiboTree * const treeptr,
FiboNode * const nodeptr)
{
fiboTreeDelMacro (treeptr, nodeptr);
#ifdef FIBO_DEBUG
/* Scrub the removed node's pointers so that any
** use-after-delete is caught early in debug builds. */
nodeptr->pareptr =
nodeptr->chldptr =
nodeptr->linkdat.prevptr =
nodeptr->linkdat.nextptr = NULL;
fiboTreeCheck (treeptr);
#endif /* FIBO_DEBUG */
}
#endif /* fiboTreeDel */
/* These routines check the consistency of a
** Fibonacci tree: sibling list linkage, parent
** pointers, and recorded node degrees.
** They return:
** - 0 : if the structure is consistent.
** - !0 : on error.
*/
#ifdef FIBO_DEBUG
/* Recursively check one node's child ring and subtrees. */
static
int
fiboTreeCheck2 (
const FiboNode * const nodeptr)
{
FiboNode * chldptr;
int degrval;
degrval = 0;
chldptr = nodeptr->chldptr;
if (chldptr != NULL) {
do {
if (chldptr->linkdat.nextptr->linkdat.prevptr != chldptr) {
errorPrint ("fiboTreeCheck: bad child linked list");
return (1);
}
if (chldptr->pareptr != nodeptr) {
errorPrint ("fiboTreeCheck: bad child parent");
return (1);
}
if (fiboTreeCheck2 (chldptr) != 0)
return (1);
degrval ++;
chldptr = chldptr->linkdat.nextptr;
} while (chldptr != nodeptr->chldptr);
}
if (degrval != (nodeptr->deflval >> 1)) { /* Real node degree is obtained by discarding lowest bit */
errorPrint ("fiboTreeCheck2: invalid child information");
return (1);
}
return (0);
}
/* Check every root in the tree's root list. */
int
fiboTreeCheck (
const FiboTree * const treeptr)
{
FiboNode * nodeptr;
for (nodeptr = treeptr->rootdat.linkdat.nextptr;
nodeptr != &treeptr->rootdat; nodeptr = nodeptr->linkdat.nextptr) {
if (nodeptr->linkdat.nextptr->linkdat.prevptr != nodeptr) {
errorPrint ("fiboTreeCheck: bad root linked list");
return (1);
}
if (nodeptr->pareptr != NULL) {
errorPrint ("fiboTreeCheck: bad root parent");
return (1);
}
if (fiboTreeCheck2 (nodeptr) != 0)
return (1);
}
return (0);
}
#endif /* FIBO_DEBUG */

205
ompi/mca/topo/treematch/treematch/fibo.h Обычный файл
Просмотреть файл

@ -0,0 +1,205 @@
/* Copyright 2010 IPB, INRIA & CNRS
**
** This file originally comes from the Scotch software package for
** static mapping, graph partitioning and sparse matrix ordering.
**
** This software is governed by the CeCILL-B license under French law
** and abiding by the rules of distribution of free software. You can
** use, modify and/or redistribute the software under the terms of the
** CeCILL-B license as circulated by CEA, CNRS and INRIA at the following
** URL: "http://www.cecill.info".
**
** As a counterpart to the access to the source code and rights to copy,
** modify and redistribute granted by the license, users are provided
** only with a limited warranty and the software's author, the holder of
** the economic rights, and the successive licensors have only limited
** liability.
**
** In this respect, the user's attention is drawn to the risks associated
** with loading, using, modifying and/or developing or reproducing the
** software by the user in light of its specific status of free software,
** that may mean that it is complicated to manipulate, and that also
** therefore means that it is reserved for developers and experienced
** professionals having in-depth computer knowledge. Users are therefore
** encouraged to load and test the software's suitability as regards
** their requirements in conditions enabling the security of their
** systems and/or data to be ensured and, more generally, to use and
** operate it in the same conditions as regards security.
**
** The fact that you are presently reading this means that you have had
** knowledge of the CeCILL-B license and that you accept its terms.
*/
/************************************************************/
/** **/
/** NAME : fibo.h **/
/** **/
/** AUTHOR : Francois PELLEGRINI **/
/** **/
/** FUNCTION : This module contains the definitions of **/
/** the generic Fibonacci trees. **/
/** **/
/** DATES : # Version 1.0 : from : 01 may 2010 **/
/** to 12 may 2010 **/
/** **/
/** NOTES : # Since this module has originally been **/
/** designed as a gain keeping data **/
/** structure for local optimization **/
/** algorithms, the computation of the **/
/** best node is only done when actually **/
/** searching for it. **/
/** This is most useful when many **/
/** insertions and deletions can take **/
/** place in the mean time. This is why **/
/** this data structure does not keep **/
/** track of the best node, unlike most **/
/** implementations do. **/
/** **/
/************************************************************/
/*
** The type and structure definitions.
*/
/* The doubly linked list structure (circular sibling ring). */
typedef struct FiboLink_ {
struct FiboNode_ * prevptr; /*+ Pointer to previous sibling element +*/
struct FiboNode_ * nextptr; /*+ Pointer to next sibling element +*/
} FiboLink;
/* The tree node data structure. The deflval
variable merges degree and flag variables.
The degree of a node is smaller than
"bitsizeof (INT)", so it can be held in an
"int". The flag value is stored in the
lowest bit of the value. */
typedef struct FiboNode_ {
struct FiboNode_ * pareptr; /*+ Pointer to parent element, if any +*/
struct FiboNode_ * chldptr; /*+ Pointer to first child element, if any +*/
FiboLink linkdat; /*+ Pointers to sibling elements +*/
int deflval; /*+ Lowest bit: flag value; other bits: degree value +*/
} FiboNode;
/* The tree data structure. The fake dummy node aims
at handling root node insertion without any test.
This is important as many insertions have to be
performed. */
typedef struct FiboTree_ {
FiboNode rootdat; /*+ Dummy node for fast root insertion +*/
FiboNode ** restrict degrtab; /*+ Consolidation array of size "bitsizeof (INT)" +*/
int (* cmpfptr) (const FiboNode * const, const FiboNode * const); /*+ Comparison routine +*/
} FiboTree;
/*
** The macro definitions.
*/
/* This is the core of the module. All of
the algorithms have been de-recursived
and written as macros. */
/* Insert node (n) right after node (o) in its sibling ring. */
#define fiboTreeLinkAfter(o,n) do { \
FiboNode * nextptr; \
nextptr = (o)->linkdat.nextptr; \
(n)->linkdat.nextptr = nextptr; \
(n)->linkdat.prevptr = (o); \
nextptr->linkdat.prevptr = (n); \
(o)->linkdat.nextptr = (n); \
} while (0)
/* Remove node (n) from its sibling ring (its own pointers are left dangling). */
#define fiboTreeUnlink(n) do { \
(n)->linkdat.prevptr->linkdat.nextptr = (n)->linkdat.nextptr; \
(n)->linkdat.nextptr->linkdat.prevptr = (n)->linkdat.prevptr; \
} while (0)
/* Add node (n) to tree (t) as a fresh root of degree 0 with flag cleared. */
#define fiboTreeAddMacro(t,n) do { \
(n)->pareptr = NULL; \
(n)->chldptr = NULL; \
(n)->deflval = 0; \
fiboTreeLinkAfter (&((t)->rootdat), (n)); \
} while (0)
/* Finding the minimum requires a consolidation pass (see NOTES in the header). */
#define fiboTreeMinMacro(t) (fiboTreeConsolidate (t))
/* Promote every child of node (n) to a root of tree (t). */
#define fiboTreeCutChildren(t,n) do { \
FiboNode * chldptr; \
chldptr = (n)->chldptr; \
if (chldptr != NULL) { \
FiboNode * cendptr; \
cendptr = chldptr; \
do { \
FiboNode * nextptr; \
nextptr = chldptr->linkdat.nextptr; \
chldptr->pareptr = NULL; \
fiboTreeLinkAfter (&((t)->rootdat), chldptr); \
chldptr = nextptr; \
} while (chldptr != cendptr); \
} \
} while (0)
/* Delete node (n) from tree (t): unlink it, promote its children,
then perform cascading cuts up the ancestor chain using the flag
bit (lowest bit of deflval) to decide where to stop. */
#define fiboTreeDelMacro(t,n) do { \
FiboNode * pareptr; \
FiboNode * rghtptr; \
pareptr = (n)->pareptr; \
fiboTreeUnlink (n); \
fiboTreeCutChildren ((t), (n)); \
if (pareptr == NULL) \
break; \
rghtptr = (n)->linkdat.nextptr; \
while (1) { \
FiboNode * gdpaptr; \
int deflval; \
deflval = pareptr->deflval - 2; \
pareptr->deflval = deflval | 1; \
gdpaptr = pareptr->pareptr; \
pareptr->chldptr = (deflval <= 1) ? NULL : rghtptr; \
if (((deflval & 1) == 0) || (gdpaptr == NULL)) \
break; \
rghtptr = pareptr->linkdat.nextptr; \
fiboTreeUnlink (pareptr); \
pareptr->pareptr = NULL; \
fiboTreeLinkAfter (&((t)->rootdat), pareptr); \
pareptr = gdpaptr; \
} \
} while (0)
/*
** The function prototypes.
*/
/* This set of definitions allows the user
to specify whether he prefers to use
the fibonacci routines as macros or as
regular functions, for instance for
debugging. */
#define fiboTreeAdd fiboTreeAddMacro
/* #define fiboTreeDel fiboTreeDelMacro */
/* #define fiboTreeMin fiboTreeMinMacro */
/* NOTE(review): when this header is included without FIBO defined
(i.e. outside fibo.c), "static" is temporarily defined to nothing so
the fiboTreeCheck2 prototype below stays valid; presumably intentional
Scotch idiom, but it shadows a keyword — confirm before reusing. */
#ifndef FIBO
#define static
#endif
int fiboTreeInit (FiboTree * const, int (*) (const FiboNode * const, const FiboNode * const));
void fiboTreeExit (FiboTree * const);
void fiboTreeFree (FiboTree * const);
FiboNode * fiboTreeConsolidate (FiboTree * const);
#ifndef fiboTreeAdd
void fiboTreeAdd (FiboTree * const, FiboNode * const);
#endif /* fiboTreeAdd */
#ifndef fiboTreeDel
void fiboTreeDel (FiboTree * const, FiboNode * const);
#endif /* fiboTreeDel */
#ifndef fiboTreeMin
FiboNode * fiboTreeMin (FiboTree * const);
#endif /* fiboTreeMin */
#ifdef FIBO_DEBUG
int fiboTreeCheck (const FiboTree * const);
static int fiboTreeCheck2 (const FiboNode * const);
#endif /* FIBO_DEBUG */
#undef static

Просмотреть файл

@ -0,0 +1,339 @@
#include <stdlib.h>
#include <stdio.h>
#include "k-partitioning.h"
#include "tm_mt.h"
#include "tm_verbose.h"
void memory_allocation(PriorityQueue ** Q, PriorityQueue ** Qinst, double *** D, int n, int k);
void initialization(int * const part, double ** const matrice, PriorityQueue * const Qpart, PriorityQueue * const Q, PriorityQueue * const Qinst, double ** const D, int n, int k, int * const deficit, int * const surplus);
void algo(int * const part, double ** const matrice, PriorityQueue * const Qpart, PriorityQueue * const Q, PriorityQueue * const Qinst, double ** const D, int n, int * const deficit, int * const surplus);
double nextGain(PriorityQueue * const Qpart, PriorityQueue * const Q, int * const deficit, int * const surplus);
void balancing(int n, int deficit, int surplus, double ** const D, int * const part);
void destruction(PriorityQueue * Qpart, PriorityQueue * Q, PriorityQueue * Qinst, double ** D, int n, int k);
void allocate_vertex2(int u, int *res, double **comm, int n, int *size, int max_size);
double eval_cost2(int *,int,double **);
int *kpartition_greedy2(int k, double **comm, int n, int nb_try_max, int *constraints, int nb_constraints);
int* build_p_vector(double **comm, int n, int k, int greedy_trials, int * constraints, int nb_constraints);
/* Entry point of the k-partitioning refinement: builds an initial partition
** of the n vertices of comm into k subsets, then repeatedly applies gain-based
** moves until no positive-gain move remains, and finally rebalances.
** Returns a freshly allocated partition vector of size n (caller frees). */
int* kPartitioning(double ** comm, int n, int k, int * constraints, int nb_constraints, int greedy_trials)
{
  PriorityQueue part_queue;                /* best achievable gain per subset */
  PriorityQueue *vertex_queues = NULL;     /* per subset: its vertices, keyed by gain */
  PriorityQueue *gain_queues = NULL;       /* per vertex: D(i,j) for each subset j */
  double **gain_mat = NULL;                /* the D(i,j) matrix */
  int *partition = NULL;
  int deficit = 0, surplus = 0;
  int nb_real = n - nb_constraints;        /* "real" vertices, excluding dumb ones */

  /* Initial partition vector (greedy or cyclic, depending on greedy_trials) */
  partition = build_p_vector(comm, n, k, greedy_trials, constraints, nb_constraints);

  memory_allocation(&vertex_queues, &gain_queues, &gain_mat, nb_real, k);
  initialization(partition, comm, &part_queue, vertex_queues, gain_queues, gain_mat, nb_real, k, &deficit, &surplus);

  /* Main refinement loop: keep moving vertices while a positive gain exists */
  while (nextGain(&part_queue, vertex_queues, &deficit, &surplus) > 0)
    algo(partition, comm, &part_queue, vertex_queues, gain_queues, gain_mat, nb_real, &deficit, &surplus);

  /* If the partition ended up unbalanced, make one last compensating move */
  balancing(nb_real, deficit, surplus, gain_mat, partition);

  destruction(&part_queue, vertex_queues, gain_queues, gain_mat, nb_real, k);
  return partition;
}
/* Allocates the work structures used by kPartitioning:
** - *Q     : one PriorityQueue per subset (k of them)
** - *Qinst : one PriorityQueue per vertex (n of them)
** - *D     : an n x k matrix of doubles, zero-initialized.
** The original code left calloc/malloc results unchecked, so an OOM
** crashed later on a NULL dereference; we now fail fast instead. */
void memory_allocation(PriorityQueue ** Q, PriorityQueue ** Qinst, double *** D, int n, int k)
{
  int i;
  *Q = calloc(k, sizeof(PriorityQueue));     /* one Q for each partition */
  *Qinst = calloc(n, sizeof(PriorityQueue)); /* one Qinst for each vertex */
  *D = malloc(sizeof(double *) * n);         /* D's size is n * k */
  if ((*Q == NULL) || (*Qinst == NULL) || (*D == NULL)) {
    fprintf(stderr, "%s: memory allocation failed\n", __func__);
    exit(-1); /* matches the error style used in algo() */
  }
  for (i = 0; i < n; ++i) {
    (*D)[i] = calloc(k, sizeof(double));
    if ((*D)[i] == NULL) {
      fprintf(stderr, "%s: memory allocation failed\n", __func__);
      exit(-1);
    }
  }
}
/* Builds the initial state of the refinement: fills the D(i,j) gain matrix
** (D[i][j] = total communication of vertex i with the vertices currently in
** subset j) and loads the three queue families from it. deficit/surplus are
** both reset to 0, meaning "the partition is currently balanced". */
void initialization(int * const part, double ** const matrice, PriorityQueue * const Qpart, PriorityQueue * const Q, PriorityQueue * const Qinst, double ** const D, int n, int k, int * const deficit, int * const surplus)
{
int i,j;
/* ##### PriorityQueue initializations ##### */
/* We initialize Qpart with a size of k because it contains the subsets' indexes. */
PQ_init(Qpart, k);
/* We initialize each Q[i] with a size of n because each vertex is in one of these queues at any time. */
/* However we could set a size of (n/k)+1 as this is the maximum size of a subset when the partition is not balanced. */
for(i=0; i<k; ++i)
PQ_init(&Q[i], n);
/* We initialize each Qinst[i] with a size of k because for each vertex i, Qinst[i] contains the D(i,j) values for j = 0...(k-1) */
for(i=0; i<n; ++i)
PQ_init(&Qinst[i], k);
/* ##### Computing the D(i,j) values ##### */
for(i=0; i < n; ++i) /*for each vertex i*/
{
for(j=0; j < n; ++j) /*and for each vertex j*/
{
D[i][part[j]] += matrice[i][j];
}
}
/* ##### Filling up the queues ##### */
/* ### Qinst ### */
for(i=0; i < n; ++i) /*for each vertex i*/
for(j=0; j < k; ++j) /*and for each subset j*/
PQ_insert(&Qinst[i], j, D[i][j]); /*we insert the corresponding D(i,j) value in Qinst[i]*/
/* ### Q ### */
for(i=0; i<n; ++i) /*for each vertex i*/
PQ_insert(&Q[part[i]], i, PQ_findMaxKey(&Qinst[i])-D[i][part[i]]); /*we insert in Q[part[i]] the vertex i with its highest possible gain*/
/* ### Qpart ### */
for(i=0; i < k; ++i) /*for each subset i*/
PQ_insert(Qpart, i, PQ_findMaxKey(&Q[i])); /*we insert it in Qpart with the highest possible gain by one of its vertex as key*/
/* ##### Initialization of deficit/surplus ##### */
*surplus = *deficit = 0;
}
/* One refinement step: picks the vertex u with the highest possible gain
** (from the whole partition if balanced, from the surplus subset otherwise),
** moves it to its best destination subset, and incrementally updates the
** D matrix and all three queue families. Exact update order matters:
** u is removed from its queues BEFORE the neighbour scan below. */
void algo(int * const part, double ** const matrice, PriorityQueue * const Qpart, PriorityQueue * const Q, PriorityQueue * const Qinst, double ** const D, int n, int * const deficit, int * const surplus)
{
int p,u,v,j;
double d;
if(*deficit == *surplus) /*if the current partition is balanced*/
{
p = PQ_deleteMax(Qpart); /*we get the subset with the highest possible gain in p and remove it from Qpart*/
u = PQ_deleteMax(&Q[p]); /*then we get the vertex with this highest possible gain in u and remove it from Q[p] */
*deficit = part[u]; /*p becomes the deficit */
}
else /*the current partition is not balanced*/
{
u = PQ_deleteMax(&Q[*surplus]); /*we get the vertex with the highest possible gain in surplus and remove it from Q[surplus] */
PQ_delete(Qpart, part[u]); /*then we remove surplus from Qpart (note that u is from surplus so part[u] is surplus) */
}
d = PQ_findMaxKey(&Q[part[u]]); /*we get the next highest possible gain in part[u] (without taking u in account as we already removed it from Q[part[u])*/
PQ_insert(Qpart, part[u], d); /*we put part[u] back in Qpart with its new highest possible gain*/
j = PQ_deleteMax(&Qinst[u]); /*we get from Qinst[u] the subset in which we have to move u to get the highest gain.*/
if ( j < 0){ /* a negative index means the queue was empty/corrupt: fatal */
if(tm_get_verbose_level() >= CRITICAL)
fprintf(stderr,"Error Max element in priority queue negative!\n");
exit(-1);
}
*surplus = j; /*this subset becomes surplus*/
for(v=0; v < n; ++v) /*we scan though all edges (u,v) */
{
j = part[u]; /*we set j to the starting subset */
D[v][j]= D[v][j] - matrice[u][v]; /*we compute the new D[v, i] (here j has the value of the starting subset of u, that's why we say i) */
PQ_adjustKey(&Qinst[v], j, D[v][j]); /*we update this gain in Qinst[v]*/
j = *surplus; /*we put back the arrival subset in j*/
D[v][j] = D[v][j] + matrice[u][v]; /*matrice[u][v]; we compute the new D[v, j]*/
PQ_adjustKey(&Qinst[v], j, D[v][j]);/*we update this gain in Qinst[v]*/
d = PQ_findMaxKey(&Qinst[v]) - D[v][part[v]]; /*we compute v's new highest possible gain*/
PQ_adjustKey(&Q[part[v]], v, d); /*we update it in Q[p[v]]*/
d = PQ_findMaxKey(&Q[part[v]]); /*we get the highest possible gain in v's subset*/
PQ_adjustKey(Qpart, part[v], d); /*we update it in Qpart*/
}
part[u] = *surplus; /*we move u from i to j (here surplus has the value of j the arrival subset)*/
d = PQ_findMaxKey(&Qinst[u]) - D[u][part[u]]; /*we compute the new u's highest possible gain*/
if(!PQ_isEmpty(&Qinst[u])) /*if at least one more move of u is possible*/
PQ_insert(&Q[part[u]], u, d); /*we insert u in the Q queue of its new subset*/
PQ_adjustKey(Qpart, part[u], d); /*we update the new highest possible gain in u's subset*/
}
/* Returns the gain of the best currently available move:
** - balanced partition (deficit == surplus): the best gain over all subsets;
** - unbalanced partition: the best gain from the surplus subset only. */
double nextGain(PriorityQueue * const Qpart, PriorityQueue * const Q, int * const deficit, int * const surplus)
{
  return (*deficit == *surplus)
       ? PQ_findMaxKey(Qpart)
       : PQ_findMaxKey(&Q[*surplus]);
}
/* Final balancing pass: when the partition ended unbalanced, move the single
** best vertex from the surplus subset to the deficit subset. No-op when the
** partition is already balanced (surplus == deficit). */
void balancing(int n, int deficit, int surplus, double ** const D, int * const part)
{
  int v;
  PriorityQueue candidates; /* possible moves from surplus to deficit */

  if (surplus == deficit) /* already balanced: nothing to do */
    return;

  PQ_init(&candidates, n);
  for (v = 0; v < n; ++v) {
    if (part[v] == surplus) /* key = gain of moving v from surplus to deficit */
      PQ_insert(&candidates, v, D[v][deficit] - D[v][surplus]);
  }
  part[PQ_deleteMax(&candidates)] = deficit; /* apply the best move */
  PQ_exit(&candidates);
}
/* Releases every structure built by memory_allocation/initialization.
** Qpart itself lives on the caller's stack, so only its contents are freed. */
void destruction(PriorityQueue * Qpart, PriorityQueue * Q, PriorityQueue * Qinst, double ** D, int n, int k)
{
  int idx;

  PQ_exit(Qpart);
  for (idx = 0; idx < k; ++idx)
    PQ_exit(&Q[idx]);
  free(Q);
  for (idx = 0; idx < n; ++idx)
    PQ_exit(&Qinst[idx]);
  free(Qinst);
  for (idx = 0; idx < n; ++idx)
    free(D[idx]);
  free(D);
}
/* Randomized greedy construction of an initial k-partition, repeated
** nb_try_max times; the cheapest result (eval_cost2) is kept and returned
** as a freshly allocated vector of size n (caller frees). Dumb vertices
** (constraints) are pinned first, then k random seeds, then every remaining
** vertex goes to the partition maximizing its communication.
** NOTE(review): allocations are unchecked, and the seeding do/while relies
** on finding unassigned vertices — fine while k*max_size <= n, confirm. */
int *kpartition_greedy2(int k, double **comm, int n, int nb_try_max, int *constraints, int nb_constraints)
{
int *res = NULL, *best_res=NULL, *size = NULL;
int i,j,nb_trials;
int max_size;
double cost, best_cost = -1;
for( nb_trials = 0 ; nb_trials < nb_try_max ; nb_trials++ ){
res = (int *)malloc(sizeof(int)*n);
for ( i = 0 ; i < n ; ++i )
res[i] = -1; /* -1 marks "not yet assigned" */
size = (int *)calloc(k,sizeof(int));
max_size = n/k;
/* put "dumb" vertices in the correct partition if there are any*/
if (nb_constraints){ /*if there are at least one constraint*/
int nb_real_nodes = n-nb_constraints; /*this is the number of "real" nodes by opposition to the dumb ones*/
for(i=0; i<nb_constraints; ++i) /*for each constraint*/
{
int i_part = constraints[i]/max_size; /*we compute its partition*/
res[nb_real_nodes+i] = i_part; /*and we set it in partition vector*/
size[i_part]++; /*we update the partition's size*/
}
}
/* choose k initial "true" vertices at random and put them in a different partition */
for ( i = 0 ; i < k ; ++i ){
/* if the partition is full of dumb vertices go to next partition*/
if(size[i] >= max_size)
continue;
/* find a vertex not already partitionned*/
do{
/* call the mersenne twister PRNG of tm_mt.c*/
j = genrand_int32() % n;
} while ( res[j] != -1 );
/* allocate and update size of partition*/
res[j] = i;
/* printf("random: %d -> %d\n",j,i); */
size[i]++;
}
/* allocate each unallocated vertices in the partition that maximize the communication*/
for( i = 0 ; i < n ; ++i )
if( res[i] == -1)
allocate_vertex2(i, res, comm, n-nb_constraints, size, max_size);
cost = eval_cost2(res,n-nb_constraints,comm);
/*print_1D_tab(res,n);
printf("cost=%.2f\n",cost);*/
if((cost<best_cost) || (best_cost == -1)){ /* keep the cheapest trial so far */
best_cost=cost;
free(best_res);
best_res=res; /* ownership of res moves to best_res */
}else
free(res);
free(size);
}
/*print_1D_tab(best_res,n);
printf("best_cost=%.2f\n",best_cost);
*/
return best_res;
}
/* Assigns the unallocated vertex u to the partition (among the partitions of
** already-placed vertices with room left, i.e. size < max_size) that maximizes
** comm[u][i] with some placed vertex i.
** Fix: the original unconditionally executed res[u] = best_part and
** size[best_part]++ even when no candidate was found, writing size[-1]
** (out-of-bounds, undefined behavior). We now guard that case and leave
** res[u] unassigned (-1). */
void allocate_vertex2(int u, int *res, double **comm, int n, int *size, int max_size)
{
  int i, best_part = -1;
  double cost, best_cost = -1;

  for (i = 0; i < n; ++i) {
    if ((res[i] != -1) && (size[res[i]] < max_size)) {
      cost = comm[u][i];
      if (cost > best_cost) {
        best_cost = cost;
        best_part = res[i];
      }
    }
  }
  if (best_part == -1) { /* no placed vertex with room: avoid size[-1] UB */
    fprintf(stderr, "%s: no available partition for vertex %d\n", __func__, u);
    return; /* res[u] stays -1 */
  }
  res[u] = best_part;
  size[best_part]++;
}
/* Cost of a partition: the total communication weight carried by cut edges,
** i.e. the sum of comm[i][j] over all pairs placed in different subsets. */
double eval_cost2(int *partition, int n, double **comm)
{
  double total = 0;
  int row, col;

  for (row = 0; row < n; ++row) {
    for (col = row + 1; col < n; ++col) { /* each unordered pair once */
      if (partition[row] == partition[col])
        continue; /* intra-subset edges are free */
      total += comm[row][col];
    }
  }
  return total;
}
/* Builds the initial partition vector of size n:
** - greedy_trials > 0 : randomized greedy construction (kpartition_greedy2);
** - greedy_trials == 0: deterministic cyclic fill, with dumb (constrained)
**   vertices pinned first at indexes nb_real_nodes..n-1.
** Returns a freshly allocated vector (caller frees). */
int* build_p_vector(double **comm, int n, int k, int greedy_trials, int * constraints, int nb_constraints)
{
int * part = NULL;
if(greedy_trials>0) /*if greedy_trials > 0 then we use kpartition_greedy with greedy_trials trials*/
{
part = kpartition_greedy2(k, comm, n, greedy_trials, constraints, nb_constraints);
}
else
{
int * size = calloc(k, sizeof(int));
int i,j;
int nodes_per_part = n/k;
int nb_real_nodes = n-nb_constraints;
part = malloc(sizeof(int) * n);
for(i=0; i<nb_constraints; i++) /*for each constraint*/
{
int i_part = constraints[i]/nodes_per_part; /*we compute the partition where we have to put this constraint*/
part[nb_real_nodes+i] = i_part;
size[i_part]++;
}
j=0;
/* now we have to fill the partitions with the "real" nodes */
for(i=0; i<nb_real_nodes; i++) /*for each node*/
{
if(size[j] < nodes_per_part) /*if j partition isn't full*/
{
size[j]++;
part[i] = j; /*then we put the node in this part*/
}
else /*otherwise we decrement i to get the same node in the next loop*/
{
i--; /* retry the same vertex against the next partition */
}
j = (j+1)%k; /*and we change j to the next partition*/
}
free(size);
}
return part;
}

Просмотреть файл

@ -0,0 +1,20 @@
#ifndef K_PARTITIONING
#define K_PARTITIONING
#include "PriorityQueue.h"
/*
kPartitioning : function to call the k-partitioning algorithm
- comm : the communication matrix
- n : the number of vertices (including dumb vertices)
- k : the number of partitions
- constraints : the list of constraints
- nb_constraints : the number of constraints
- greedy_trials : the number of trials to build the partition vector with kpartition_greedy
- 0 : cyclic distribution of vertices
- > 0 : use of kpartition_greedy with greedy_trials number of trials
*/
int* kPartitioning(double ** comm, int n, int k, int * const constraints, int nb_constraints, int greedy_trials);
#endif /*K_PARTITIONING*/

Просмотреть файл

@ -1,56 +0,0 @@
#include <stdio.h>
#include <string.h>
#include <ctype.h>
//#include "tm_hwloc.h"
#include "tm_tree.h"
#include "tm_mapping.h"
#include "tm_timings.h"
/* Standalone test driver (removed file in this commit): loads an architecture
** description and a communication pattern, runs the TreeMatch mapping, and
** prints the resulting process placement and its timing.
** NOTE(review): the stray ';' after '{' is an empty statement — harmless. */
int main(int argc, char**argv){;
tree_t *comm_tree=NULL;
double **comm,**arch;
tm_topology_t *topology;
int nb_processes,nb_cores;
int *sol,*k;
if(argc<3){
fprintf(stderr,"Usage: %s <Architecture tgt> <communication partern file>\n",argv[0]);
return -1;
}
topology=tgt_to_tm(argv[1],&arch);
optimize_topology(&topology);
nb_processes=build_comm(argv[2],&comm);
sol=(int*)MALLOC(sizeof(int)*nb_processes);
nb_cores=nb_processing_units(topology);
k=(int*)MALLOC(sizeof(int)*nb_cores);
// TreeMatchMapping(nb_processes,nb_cores,comm,sol);
/* mapping is impossible when there are more processes than cores */
if(nb_processes>nb_cores){
fprintf(stderr,"Error: to many processes (%d) for this topology (%d nodes)\n",nb_processes,nb_cores);
exit(-1);
}
TIC;
comm_tree=build_tree_from_topology(topology,comm,nb_processes,NULL,NULL);
map_topology_simple(topology,comm_tree,sol,k);
double duration=TOC;
printf("mapping duration: %f\n",duration);
printf("TreeMatch: ");
print_sol_inv(nb_processes,sol,comm,arch);
//print_1D_tab(k,nb_cores);
// display_other_heuristics(topology,nb_processes,comm,arch);
//display_tab(arch,nb_cores);
FREE_topology(topology);
//FREE_tree(comm_tree);
FREE(sol);
FREE(comm);
FREE(arch);
return 0;
}

Просмотреть файл

@ -1,31 +0,0 @@
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "tm_hwloc.h"
#include "tm_tree.h"
#include "tm_mapping.h"
#include "tm_timings.h"
/* Standalone driver (removed file in this commit): loads an architecture
** description file and prints its distance/affinity matrix. */
int main(int argc, char**argv){;
tm_topology_t *topology;
int nb_cores;
double **arch;
if(argc<2){
fprintf(stderr,"Usage: %s <Architecture tgt>\n",argv[0]);
return -1;
}
topology=tgt_to_tm(argv[1],&arch);
nb_cores=nb_nodes(topology);
display_tab(arch,nb_cores);
FREE_topology(topology);
FREE(arch);
return 0;
}

Просмотреть файл

@ -31,7 +31,7 @@ static int ilog2(int val)
static int verbose_level = ERROR;
bucket_list_t global_bl = {0};
bucket_list_t global_bl;
int tab_cmp(const void*,const void*);
int old_bucket_id(int,int,bucket_list_t);
@ -47,12 +47,12 @@ void fill_buckets(bucket_list_t);
int is_power_of_2(int);
void partial_sort(bucket_list_t *,double **,int);
void next_bucket_elem(bucket_list_t,int *,int *);
int add_edge_3(tree_t *,tree_t *,int,int,int *);
void FREE_bucket(bucket_t *);
void FREE_tab_bucket(bucket_t **,int);
void FREE_bucket_list(bucket_list_t);
void partial_update_val (int nb_args, void **args);
int add_edge_3(tm_tree_t *,tm_tree_t *,int,int,int *);
void free_bucket(bucket_t *);
void free_tab_bucket(bucket_t **,int);
void free_bucket_list(bucket_list_t);
void partial_update_val (int nb_args, void **args, int thread_id);
double bucket_grouping(tm_affinity_mat_t *,tm_tree_t *, tm_tree_t *, int ,int);
int tab_cmp(const void* x1,const void* x2)
{
int *e1 = NULL,*e2 = NULL,i1,i2,j1,j2;
@ -146,7 +146,7 @@ void check_bucket(bucket_t *b,double **tab,double inf, double sup)
j = b->bucket[k].j;
if((tab[i][j] < inf) || (tab[i][j] > sup)){
if(verbose_level >= CRITICAL)
printf("[%d] (%d,%d):%f not in [%f,%f]\n",k,i,j,tab[i][j],inf,sup);
fprintf(stderr,"[%d] (%d,%d):%f not in [%f,%f]\n",k,i,j,tab[i][j],inf,sup);
exit(-1);
}
}
@ -197,15 +197,20 @@ void add_to_bucket(int id,int i,int j,bucket_list_t bucket_list)
n = bucket_list->nb_buckets;
size = N*N/n;
/* display_bucket(bucket);*/
bucket->bucket = (coord*)realloc(bucket->bucket,sizeof(coord)*(size + bucket->bucket_len));
if(verbose_level >= DEBUG){
printf("Extending bucket %d (%p) from size %d to size %d!\n",
id,bucket->bucket, bucket->nb_elem, bucket->nb_elem+size);
}
bucket->bucket = (coord*)REALLOC(bucket->bucket,sizeof(coord)*(size + bucket->bucket_len));
bucket->bucket_len += size;
if(verbose_level >= DEBUG){
printf("MALLOC/realloc: %d\n",id);
printf("(%d,%d)\n",i,j);
display_bucket(bucket);
printf("\n");
}
/* if(verbose_level >= DEBUG){ */
/* printf("MALLOC/realloc: %d\n",id); */
/* printf("(%d,%d)\n",i,j); */
/* display_bucket(bucket); */
/* printf("\n"); */
/* } */
}
@ -289,7 +294,13 @@ void partial_sort(bucket_list_t *bl,double **tab,int N)
bucket_list_t bucket_list;
int nb_buckets, nb_bits;
/* after these operations, nb_bucket is a power of 2 interger close to log2(N)*/
if( N <= 0){
if(verbose_level >= ERROR )
fprintf(stderr,"Error: tryng to group a matrix of size %d<=0!\n",N);
return;
}
/* after these operations, nb_buckets is a power of 2 interger close to log2(N)*/
nb_buckets = (int)floor(CmiLog2(N));
@ -404,7 +415,7 @@ void next_bucket_elem(bucket_list_t bucket_list,int *i,int *j)
}
int add_edge_3(tree_t *tab_node, tree_t *parent,int i,int j,int *nb_groups)
int add_edge_3(tm_tree_t *tab_node, tm_tree_t *parent,int i,int j,int *nb_groups)
{
/* printf("%d <-> %d ?\n",tab_node[i].id,tab_node[j].id); */
if((!tab_node[i].parent) && (!tab_node[j].parent)){
@ -453,7 +464,7 @@ int add_edge_3(tree_t *tab_node, tree_t *parent,int i,int j,int *nb_groups)
return 0;
}
int try_add_edge(tree_t *tab_node, tree_t *parent,int arity,int i,int j,int *nb_groups)
int try_add_edge(tm_tree_t *tab_node, tm_tree_t *parent,int arity,int i,int j,int *nb_groups)
{
assert( i != j );
@ -481,40 +492,40 @@ int try_add_edge(tree_t *tab_node, tree_t *parent,int arity,int i,int j,int *nb_
}
}
void FREE_bucket(bucket_t *bucket)
void free_bucket(bucket_t *bucket)
{
FREE(bucket->bucket);
FREE(bucket);
}
void FREE_tab_bucket(bucket_t **bucket_tab,int N)
void free_tab_bucket(bucket_t **bucket_tab,int N)
{
int i;
for( i = 0 ; i < N ; i++ )
FREE_bucket(bucket_tab[i]);
free_bucket(bucket_tab[i]);
FREE(bucket_tab);
}
void FREE_bucket_list(bucket_list_t bucket_list)
void free_bucket_list(bucket_list_t bucket_list)
{
/* Do not FREE the tab field it is used elsewhere */
FREE_tab_bucket(bucket_list->bucket_tab,bucket_list->nb_buckets);
/* Do not free the tab field it is used elsewhere */
free_tab_bucket(bucket_list->bucket_tab,bucket_list->nb_buckets);
FREE(bucket_list->pivot);
FREE(bucket_list->pivot_tree);
FREE(bucket_list);
}
void partial_update_val (int nb_args, void **args){
void partial_update_val (int nb_args, void **args, int thread_id){
int inf = *(int*)args[0];
int sup = *(int*)args[1];
affinity_mat_t *aff_mat = (affinity_mat_t*)args[2];
tree_t *new_tab_node = (tree_t*)args[3];
tm_affinity_mat_t *aff_mat = (tm_affinity_mat_t*)args[2];
tm_tree_t *new_tab_node = (tm_tree_t*)args[3];
double *res=(double*)args[4];
int l;
if(nb_args != 6){
if(nb_args != 5){
if(verbose_level >= ERROR)
fprintf(stderr,"Wrong number of args in %s: %d\n",__func__, nb_args);
fprintf(stderr,"(Thread: %d) Wrong number of args in %s: %d\n",thread_id, __FUNCTION__, nb_args);
exit(-1);
}
@ -524,7 +535,7 @@ void partial_update_val (int nb_args, void **args){
}
}
void bucket_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_node,
double bucket_grouping(tm_affinity_mat_t *aff_mat,tm_tree_t *tab_node, tm_tree_t *new_tab_node,
int arity,int M)
{
bucket_list_t bucket_list;
@ -536,10 +547,12 @@ void bucket_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_n
int N = aff_mat->order;
double **mat = aff_mat->mat;
verbose_level = get_verbose_level();
verbose_level = tm_get_verbose_level();
if(verbose_level >= INFO )
printf("starting sort of N=%d elements\n",N);
TIC;
partial_sort(&bucket_list,mat,N);
duration = TOC;
@ -662,8 +675,8 @@ void bucket_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_n
printf("Bucket: %d, indice:%d\n",bucket_list->cur_bucket,bucket_list->bucket_indice);
printf("val=%f\n",val);
}
FREE_bucket_list(bucket_list);
free_bucket_list(bucket_list);
/* exit(-1); */
/* display_grouping(new_tab_node,M,arity,val); */
return val;
}

Просмотреть файл

@ -28,7 +28,8 @@ typedef struct{
typedef _bucket_list_t *bucket_list_t;
void bucket_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_node,
int arity,int M);
int try_add_edge(tree_t *tab_node, tree_t *parent,int arity,int i,int j,int *nb_groups);
double bucket_grouping(tm_affinity_mat_t *aff_mat,tm_tree_t *tab_node, tm_tree_t *new_tab_node,
int arity,int M);
int try_add_edge(tm_tree_t *tab_node, tm_tree_t *parent,int arity,int i,int j,int *nb_groups);
#endif

Просмотреть файл

@ -1,286 +0,0 @@
#include "opal/mca/hwloc/hwloc-internal.h"
#include "tm_tree.h"
#include "tm_mapping.h"
#include <ctype.h>
#include "tm_verbose.h"
double ** tm_topology_to_arch(tm_topology_t *topology,double *cost);
tm_topology_t * tgt_to_tm(char *filename,double **pcost);
int topo_nb_proc(hwloc_topology_t topology,int N);
double ** topology_to_arch(hwloc_topology_t topology);
int symetric(hwloc_topology_t topology);
tm_topology_t* hwloc_to_tm(char *filename,double **pcost);
tm_topology_t* get_local_topo_with_hwloc(void);
/* transform a tgt scotch file into a topology file*/
/* Parse a Scotch "tleaf" target (.tgt) file into a TreeMatch topology.
 *
 * filename: path of the tgt file; the first line must contain the token
 *           "tleaf" followed by the number of levels and then, for each
 *           internal level, an (arity, cost) pair.
 * pcost:    output parameter; on return *pcost points to a freshly
 *           CALLOC'ed array of per-level costs, aggregated so that
 *           cost[i] is the total cost from level i down to the leaves.
 *           Ownership is transferred to the caller.
 *
 * On any error (unreadable file, missing "tleaf" token) the function
 * reports on stderr (at CRITICAL verbosity) and calls exit(-1).
 *
 * NOTE(review): the fgets() return value is not checked, and strtok()
 * results are fed straight to atoi() — a truncated/malformed header line
 * would dereference NULL. TODO: confirm inputs are trusted here.
 */
tm_topology_t * tgt_to_tm(char *filename, double **pcost)
{
tm_topology_t *topology = NULL;
FILE *pf = NULL;
char line[1024];
char *s = NULL;
double *cost = NULL;
int i;
pf = fopen(filename,"r");
if(!pf){
if(get_verbose_level() >= CRITICAL)
fprintf(stderr,"Cannot open %s\n",filename);
exit(-1);
}
if(get_verbose_level() >= INFO)
printf("Reading TGT file: %s\n",filename);
/* The whole description fits on the first line of the file. */
fgets(line,1024,pf);
s = strstr(line,"tleaf");
if(!s){
if(get_verbose_level() >= CRITICAL)
fprintf(stderr,"Syntax error! %s is not a tleaf file\n",filename);
exit(-1);
}
/* Skip the "tleaf" keyword (5 chars) and any following whitespace. */
s += 5;
while(isspace(*s))
s++;
topology = (tm_topology_t*)MALLOC(sizeof(tm_topology_t));
/* +1: the tgt file counts internal levels only; add the leaf level. */
topology->nb_levels = atoi(strtok(s," "))+1;
topology->arity = (int*)MALLOC(sizeof(int)*topology->nb_levels);
cost = (double*)CALLOC(topology->nb_levels,sizeof(double));
/* Each internal level contributes an (arity, cost) pair. */
for( i = 0 ; i < topology->nb_levels-1 ; i++ ){
topology->arity[i] = atoi(strtok(NULL," "));
cost[i] = atoi(strtok(NULL," "));
}
/* Leaves have no children. */
topology->arity[topology->nb_levels-1] = 0;
/* cost[topology->nb_levels-1]=0; */
/*aggregate costs*/
/* Fold costs bottom-up so cost[i] is cumulative from level i to leaves. */
for( i = topology->nb_levels-2 ; i >= 0 ; i-- )
cost[i] += cost[i+1];
/* Presumably numbers the processing units of the synthetic tree —
   defined elsewhere; verify against tm_mapping.c. */
build_synthetic_proc_id(topology);
*pcost = cost;
fclose(pf);
/*
topology->arity[0]=nb_proc;
topology->nb_levels=decompose((int)ceil((1.0*nb_obj)/nb_proc),1,topology->arity);
printf("levels=%d\n",topology->nb_levels);
*/
if(get_verbose_level() >= INFO)
printf("Topology built from %s!\n",filename);
return topology;
}
/* Count the processing units (PUs) reachable in an hwloc topology.
 * N is an upper bound used to size the scratch object table.
 * Returns 1 (the first PU) plus the number of its closest peers. */
int topo_nb_proc(hwloc_topology_t topology,int N)
{
int count;
hwloc_obj_t *pu_tab = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*N);

/* Seed with the first PU, then collect up to N-1 of its closest peers. */
pu_tab[0] = hwloc_get_next_obj_by_type(topology,HWLOC_OBJ_PU,NULL);
count = 1 + hwloc_get_closest_objs(topology,pu_tab[0],pu_tab+1,N-1);

FREE(pu_tab);
return count;
}
/* Build an architecture (communication-cost) matrix from an hwloc topology.
 *
 * Returns a freshly MALLOC'ed nb_proc x nb_proc matrix where entry
 * [p1][p2] is speed(d+1), d being the depth of the deepest common
 * ancestor of PUs p1 and p2.  Rows/columns are indexed by the PUs'
 * os_index, not by their logical hwloc index.  Ownership of the matrix
 * (and of every row) is transferred to the caller.
 */
double ** topology_to_arch(hwloc_topology_t topology)
{
int nb_proc,i,j;
hwloc_obj_t obj_proc1,obj_proc2,obj_res;
double **arch = NULL;
nb_proc = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
arch = (double**)MALLOC(sizeof(double*)*nb_proc);
for( i = 0 ; i < nb_proc ; i++ ){
obj_proc1 = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,i);
/* Row is stored under the OS index of the PU, which assumes os_index
   values form a permutation of [0, nb_proc) — TODO confirm. */
arch[obj_proc1->os_index] = (double*)MALLOC(sizeof(double)*nb_proc);
for( j = 0 ; j < nb_proc ; j++ ){
obj_proc2 = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,j);
obj_res = hwloc_get_common_ancestor_obj(topology,obj_proc1,obj_proc2);
/* printf("arch[%d][%d] <- %ld\n",obj_proc1->os_index,obj_proc2->os_index,*((long int*)(obj_res->userdatab))); */
/* Cost is a function of how deep the common ancestor sits. */
arch[obj_proc1->os_index][obj_proc2->os_index]=speed(obj_res->depth+1);
}
}
return arch;
}
/* Return 1 when the hwloc topology is symmetric, i.e. at every depth all
 * objects share the same arity; return 0 at the first mismatch.
 * (The name keeps the historical spelling used by the callers.) */
int symetric(hwloc_topology_t topology)
{
int lvl, idx;
int nb_levels = hwloc_topology_get_depth(topology);
hwloc_obj_t cur;

/* The last level holds leaves; only internal levels need checking. */
for( lvl = 0 ; lvl < nb_levels-1 ; lvl++ ){
int nb_objs = hwloc_get_nbobjs_by_depth(topology, lvl);
unsigned int ref_arity;
cur = hwloc_get_next_obj_by_depth (topology,lvl,NULL);
/* First object at this depth fixes the reference arity. */
ref_arity = cur->arity;
for( idx = 1 ; idx < nb_objs ; idx++ ){
cur = hwloc_get_next_obj_by_depth (topology,lvl,cur);
if( cur->arity != ref_arity )
return 0; /* arity differs: not symmetric */
}
}
return 1;
}
/* Build a TreeMatch topology from an hwloc XML topology file.
 *
 * filename: path to an hwloc XML export.
 * pcost:    output parameter; on return *pcost points to a CALLOC'ed
 *           array of per-level costs, cost[i] = speed(i).  Ownership is
 *           transferred to the caller.
 *
 * The hwloc topology must be symmetric (same arity for all objects of a
 * level); otherwise the function reports on stderr and exits.  The
 * returned tm_topology_t has, per depth: the node count, the arity, and
 * the os_index of every object.  Exits via exit(-1) on a bad XML file.
 */
tm_topology_t* hwloc_to_tm(char *filename,double **pcost)
{
hwloc_topology_t topology;
tm_topology_t *res = NULL;
hwloc_obj_t *objs = NULL;
unsigned topodepth,depth;
int nb_nodes,i;
double *cost;
int err;
/* Build the topology */
hwloc_topology_init(&topology);
err = hwloc_topology_set_xml(topology,filename);
if(err == -1){
if(get_verbose_level() >= CRITICAL)
fprintf(stderr,"Error: %s is a bad xml topology file!\n",filename);
exit(-1);
}
/* Collapse single-child levels so the tree keeps only structure
   (hwloc v1 API only; the v2 equivalent is still missing). */
#if HWLOC_API_VERSION < 0x20000
hwloc_topology_ignore_all_keep_structure(topology);
#else
#warning FIXME hwloc v2
#endif
hwloc_topology_load(topology);
/* Test if symetric */
if(!symetric(topology)){
if(get_verbose_level() >= CRITICAL)
fprintf(stderr,"%s not symetric!\n",filename);
exit(-1);
}
/* work on depth */
topodepth = hwloc_topology_get_depth(topology);
res = (tm_topology_t*)MALLOC(sizeof(tm_topology_t));
res->nb_levels = topodepth;
res->node_id = (int**)MALLOC(sizeof(int*)*res->nb_levels);
res->nb_nodes = (int*)MALLOC(sizeof(int)*res->nb_levels);
res->arity = (int*)MALLOC(sizeof(int)*res->nb_levels);
if(get_verbose_level() >= INFO)
printf("topodepth = %d\n",topodepth);
/* Build TreeMatch topology */
for( depth = 0 ; depth < topodepth ; depth++ ){
nb_nodes = hwloc_get_nbobjs_by_depth(topology, depth);
res->nb_nodes[depth] = nb_nodes;
res->node_id[depth] = (int*)MALLOC(sizeof(int)*nb_nodes);
objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*nb_nodes);
objs[0] = hwloc_get_next_obj_by_depth(topology,depth,NULL);
/* Fill objs[1..] with the remaining objects of this depth. */
hwloc_get_closest_objs(topology,objs[0],objs+1,nb_nodes-1);
/* Symmetry was checked above, so the first object's arity stands
   for the whole level. */
res->arity[depth] = objs[0]->arity;
if(get_verbose_level() >= INFO)
printf("%d(%d):",res->arity[depth],nb_nodes);
/* Build process id tab */
for (i = 0; i < nb_nodes; i++){
res->node_id[depth][i] = objs[i]->os_index;
/* if(depth==topodepth-1) */
}
FREE(objs);
}
/* Per-level cost vector, returned to the caller through pcost. */
cost = (double*)CALLOC(res->nb_levels,sizeof(double));
for(i=0; i<res->nb_levels; i++){
cost[i] = speed(i);
}
*pcost = cost;
/* Destroy topology object. */
hwloc_topology_destroy(topology);
if(get_verbose_level() >= INFO)
printf("\n");
return res;
}
/* Build a TreeMatch topology by discovering the local machine with hwloc.
 *
 * Same construction as hwloc_to_tm() but without an XML file and without
 * a cost vector: the topology is loaded from the running host.  The
 * machine topology must be symmetric, otherwise the function reports on
 * stderr (at CRITICAL verbosity) and exits.  Ownership of the returned
 * tm_topology_t is transferred to the caller.
 */
tm_topology_t* get_local_topo_with_hwloc(void)
{
hwloc_topology_t topology;
tm_topology_t *res = NULL;
hwloc_obj_t *objs = NULL;
unsigned topodepth,depth;
int nb_nodes,i;
/* Build the topology */
hwloc_topology_init(&topology);
/* Collapse single-child levels (hwloc v1 API; v2 port pending). */
#if HWLOC_API_VERSION < 0x20000
hwloc_topology_ignore_all_keep_structure(topology);
#else
#warning FIXME hwloc v2
#endif
hwloc_topology_load(topology);
/* Test if symetric */
if(!symetric(topology)){
if(get_verbose_level() >= CRITICAL)
fprintf(stderr,"Local toplogy not symetric!\n");
exit(-1);
}
/* work on depth */
topodepth = hwloc_topology_get_depth(topology);
res = (tm_topology_t*)MALLOC(sizeof(tm_topology_t));
res->nb_levels = topodepth;
res->node_id = (int**)MALLOC(sizeof(int*)*res->nb_levels);
res->nb_nodes = (int*)MALLOC(sizeof(int)*res->nb_levels);
res->arity = (int*)MALLOC(sizeof(int)*res->nb_levels);
/* Build TreeMatch topology */
for( depth = 0 ; depth < topodepth ; depth++ ){
nb_nodes = hwloc_get_nbobjs_by_depth(topology, depth);
res->nb_nodes[depth] = nb_nodes;
res->node_id[depth] = (int*)MALLOC(sizeof(int)*nb_nodes);
objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*nb_nodes);
objs[0] = hwloc_get_next_obj_by_depth(topology,depth,NULL);
/* Fill objs[1..] with the remaining objects of this depth. */
hwloc_get_closest_objs(topology,objs[0],objs+1,nb_nodes-1);
/* Symmetry was checked above: one object's arity represents the level. */
res->arity[depth] = objs[0]->arity;
/* printf("%d:",res->arity[depth]); */
/* Build process id tab */
for (i = 0; i < nb_nodes; i++){
res->node_id[depth][i] = objs[i]->os_index;
/* if(depth==topodepth-1) */
}
FREE(objs);
}
/* Destroy HWLOC topology object. */
hwloc_topology_destroy(topology);
/* printf("\n"); */
return res;
}

Просмотреть файл

@ -1,7 +0,0 @@
#include "opal/mca/hwloc/hwloc-internal.h"
#include "tm_tree.h"
void hwloc_topology_tag(hwloc_topology_t topology);
tm_topology_t* hwloc_to_tm(char *filename,double **pcost);
tm_topology_t * tgt_to_tm(char *filename,double **pcost);
tm_topology_t* get_local_topo_with_hwloc(void);

Просмотреть файл

@ -1,13 +1,12 @@
#include "tm_mapping.h"
#include "tm_mt.h"
#include "tm_kpartitioning.h"
#include "k-partitioning.h"
#include <stdlib.h>
#include <stdio.h>
#include "config.h"
#define USE_KL_KPART 0
#if USE_KL_KPART
#include "k-partitioning.h"
#endif /* USE_KL_KPART */
#define KL_KPART_GREEDY_TRIALS 0
static int verbose_level = ERROR;
@ -15,25 +14,23 @@ static int verbose_level = ERROR;
#define MAX_TRIALS 10
#define USE_KL_STRATEGY 1
#if !defined(MIN)
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif
int fill_tab(int **,int *,int,int,int,int);
void complete_com_mat(double ***,int,int);
void complete_obj_weight(double **,int,int);
void allocate_vertex(int,int *,com_mat_t *,int,int *,int);
double eval_cost(int *, com_mat_t *);
int *kpartition_greedy(int, com_mat_t *,int,int *,int);
constraint_t *split_constraints (int *,int,int,tm_topology_t *,int);
constraint_t *split_constraints (int *,int,int,tm_topology_t *,int, int);
com_mat_t **split_com_mat(com_mat_t *,int,int,int *);
int **split_vertices(int *,int,int,int *);
void FREE_tab_com_mat(com_mat_t **,int);
void FREE_tab_local_vertices(int **,int);
void FREE_const_tab(constraint_t *,int);
void kpartition_build_level_topology(tree_t *,com_mat_t *,int,int,tm_topology_t *,
void free_tab_com_mat(com_mat_t **,int);
void free_tab_local_vertices(int **,int);
void free_const_tab(constraint_t *,int);
void kpartition_build_level_topology(tm_tree_t *,com_mat_t *,int,int,tm_topology_t *,
int *,int *,int,double *,double *);
@ -51,10 +48,14 @@ void allocate_vertex(int u, int *res, com_mat_t *com_mat, int n, int *size, int
best_part = res[i];
break;
}
}else{
for( i = 0 ; i < n ; i++){
if (( res[i] != -1 ) && ( size[res[i]] < max_size )){
cost = (((i)<com_mat->n)) ?com_mat->comm[u][i]:0;
/* if((n<=16) && (u==8)){ */
/* printf("u=%d, i=%d: %f\n",u, i, cost); */
/* } */
if (( cost > best_cost)){
best_cost = cost;
best_part = res[i];
@ -62,8 +63,10 @@ void allocate_vertex(int u, int *res, com_mat_t *com_mat, int n, int *size, int
}
}
}
/* printf("size[%d]: %d\n",best_part, size[best_part]);*/
/* printf("putting(%.2f): %d -> %d\n",best_cost, u, best_part); */
/* if(n<=16){ */
/* printf("size[%d]: %d\n",best_part, size[best_part]); */
/* printf("putting(%.2f): %d -> %d\n",best_cost, u, best_part); */
/* } */
res[u] = best_part;
size[best_part]++;
@ -84,25 +87,45 @@ double eval_cost(int *partition, com_mat_t *com_mat)
int *kpartition_greedy(int k, com_mat_t *com_mat, int n, int *constraints, int nb_constraints)
{
int *res = NULL, *best_res=NULL, *size = NULL;
int *partition = NULL, *best_partition=NULL, *size = NULL;
int i,j,nb_trials;
int max_size, max_val;
double cost, best_cost = -1;
int start, end;
int dumb_id, nb_dumb;
int vl = tm_get_verbose_level();
if(nb_constraints > n){
if(vl >= ERROR){
fprintf(stderr,"Error more constraints (%d) than the problem size (%d)!\n",nb_constraints, n);
}
return NULL;
}
max_size = n/k;
if(vl >= DEBUG){
printf("max_size = %d (n=%d,k=%d)\ncom_mat->n-1=%d\n",max_size,n,k,com_mat->n-1);
printf("nb_constraints = %d\n",nb_constraints);
if(n<=16){
printf("Constraints: ");print_1D_tab(constraints,nb_constraints);
}
}
/* if(com_mat->n){ */
/* printf ("val [n-1][0]= %f\n",com_mat->comm[com_mat->n-1][0]); */
/* } */
for( nb_trials = 0 ; nb_trials < MAX_TRIALS ; nb_trials++ ){
res = (int *)MALLOC(sizeof(int)*n);
partition = (int *)MALLOC(sizeof(int)*n);
for ( i = 0 ; i < n ; i ++ )
res[i] = -1;
partition[i] = -1;
size = (int *)CALLOC(k,sizeof(int));
max_size = n/k;
/*printf("Constraints: ");print_1D_tab(constraints,nb_constraints);*/
/* put "dumb" vertices in the correct partition if there are any*/
if (nb_constraints){
@ -121,12 +144,13 @@ int *kpartition_greedy(int k, com_mat_t *com_mat, int n, int *constraints, int
number of leaves of the subtree (n/k) and the number of constraints
*/
nb_dumb = n/k - (end-start);
/*printf("max_val: %d, nb_dumb=%d, start=%d, end=%d, size=%d\n",max_val, nb_dumb, start, end, n/k);*/
/* if(n<=16){ */
/* printf("max_val: %d, nb_dumb=%d, start=%d, end=%d, size=%d\n",max_val, nb_dumb, start, end, n/k); */
/* } */
/* dumb vertices are the one with highest indices:
put them in the ith partitions*/
for( j = 0; j < nb_dumb; j ++ ){
res[dumb_id] = i;
partition[dumb_id] = i;
dumb_id--;
}
/* increase the size of the ith partition accordingly*/
@ -134,7 +158,10 @@ int *kpartition_greedy(int k, com_mat_t *com_mat, int n, int *constraints, int
start=end;
}
}
/*printf("After dumb vertices mapping: ");print_1D_tab(res,n);*/
/* if(n<=16){ */
/* printf("After dumb vertices mapping: ");print_1D_tab(partition,n); */
/* } */
/* choose k initial "true" vertices at random and put them in a different partition */
for ( i = 0 ; i < k ; i ++ ){
@ -145,35 +172,39 @@ int *kpartition_greedy(int k, com_mat_t *com_mat, int n, int *constraints, int
do{
/* call the mersenne twister PRNG of tm_mt.c*/
j = genrand_int32() % n;
} while ( res[j] != -1 );
} while ( partition[j] != -1 );
/* allocate and update size of partition*/
res[j] = i;
/* printf("random: %d -> %d\n",j,i); */
partition[j] = i;
/* if(n<=16){ */
/* printf("random: %d -> %d\n",j,i); */
/* } */
size[i]++;
}
/* allocate each unaloacted vertices in the partition that maximize the communication*/
for( i = 0 ; i < n ; i ++)
if( res[i] == -1)
allocate_vertex(i, res, com_mat, n, size, max_size);
if( partition[i] == -1)
allocate_vertex(i, partition, com_mat, n, size, max_size);
cost = eval_cost(res,com_mat);
/*print_1D_tab(res,n);
printf("cost=%.2f\n",cost);*/
cost = eval_cost(partition,com_mat);
/* if(n<=16){ */
/* print_1D_tab(partition,n); */
/* printf("cost=%.2f\n",cost); */
/* } */
if((cost<best_cost) || (best_cost == -1)){
best_cost=cost;
FREE(best_res);
best_res=res;
FREE(best_partition);
best_partition=partition;
}else
FREE(res);
FREE(partition);
FREE(size);
}
/*print_1D_tab(best_res,n);
/*print_1D_tab(best_partition,n);
printf("best_cost=%.2f\n",best_cost);
*/
return best_res;
return best_partition;
}
int *kpartition(int k, com_mat_t *com_mat, int n, int *constraints, int nb_constraints)
@ -189,16 +220,24 @@ int *kpartition(int k, com_mat_t *com_mat, int n, int *constraints, int nb_const
/* if(USE_KL_KPART) */
/* res = kPartitioning(comm, n, k, constraints, nb_constraints, KL_KPART_GREEDY_TRIALS); */
/* else */
res = kpartition_greedy(k, com_mat, n, constraints, nb_constraints);
#if HAVE_LIBSCOTCH
printf("Using Scotch\n");
res = kpartition_greedy(k, com_mat, n, constraints, nb_constraints);
#else
printf("Using default\n");
res = kpartition_greedy(k, com_mat, n, constraints, nb_constraints);
#endif
return res;
}
constraint_t *split_constraints (int *constraints, int nb_constraints, int k, tm_topology_t *topology, int depth)
constraint_t *split_constraints (int *constraints, int nb_constraints, int k, tm_topology_t *topology, int depth, int N)
{
constraint_t *const_tab = NULL;
int nb_leaves, start, end;
int i;
int vl = tm_get_verbose_level();
const_tab = (constraint_t *)CALLOC(k,sizeof(constraint_t));
@ -211,11 +250,27 @@ constraint_t *split_constraints (int *constraints, int nb_constraints, int k, tm
each sub-contraints 'i' contains constraints of value in [i*nb_leaves,(i+1)*nb_leaves[
*/
start = 0;
for( i = 0; i < k; i++ ){
/*returns the indice in contsraints that contains the smallest value not copied
/*returns the indice in constraints that contains the smallest value not copied
end is used to compute the number of copied elements (end-size) and is used as the next staring indices*/
end = fill_tab(&(const_tab[i].constraints), constraints, nb_constraints,start, (i+1) * nb_leaves, i * nb_leaves);
const_tab[i].length = end-start;
if(vl>=DEBUG){
printf("Step %d\n",i);
printf("\tConstraint: "); print_1D_tab(constraints, nb_constraints);
printf("\tSub constraint: "); print_1D_tab(const_tab[i].constraints, end-start);
}
if(end-start > N/k){
if(vl >= ERROR){
fprintf(stderr, "Error in spliting constraint at step %d. N=%d k= %d, length = %d\n", i, N, k, end-start);
}
FREE(const_tab);
return NULL;
}
const_tab[i].id = i;
start = end;
}
@ -224,6 +279,7 @@ constraint_t *split_constraints (int *constraints, int nb_constraints, int k, tm
}
/* split the com_mat of order n in k partiton according to parmutition table*/
com_mat_t **split_com_mat(com_mat_t *com_mat, int n, int k, int *partition)
{
com_mat_t **res = NULL, *sub_com_mat;
@ -237,6 +293,8 @@ com_mat_t **split_com_mat(com_mat_t *com_mat, int n, int k, int *partition)
if(verbose_level >= DEBUG){
printf("Partition: "); print_1D_tab(partition,n);
display_tab(com_mat->comm,com_mat->n);
printf("m=%d,n=%d,k=%d\n",m,n,k);
printf("perm=%p\n",perm);
}
perm = (int*)MALLOC(sizeof(int)*m);
@ -244,10 +302,22 @@ com_mat_t **split_com_mat(com_mat_t *com_mat, int n, int k, int *partition)
/* build perm such that submat[i][j] correspond to com_mat[perm[i]][perm[j]] according to the partition*/
s = 0;
for( j = 0; j < com_mat->n; j ++) /* check only non zero element of of com_mat*/
/* The partition is of size n. n can be larger than the communication matrix order
as only the input problem are in the communication matrix while n is of the size
of all the element (including the added one where it is possible to map computation) :
we can have more compute units than processes*/
for( j = 0; j < com_mat->n; j ++)
if ( partition[j] == cur_part )
perm[s++] = j;
if(s>m){
if(verbose_level >= CRITICAL){
fprintf(stderr,"Partition: "); print_1D_tab(partition,n);
display_tab(com_mat->comm,com_mat->n);
fprintf(stderr,"too many elements of the partition for the permuation (s=%d>%d=m). n=%d, k=%d, cur_part= %d\n",s,m,n,k, cur_part);
}
exit(-1);
}
/* s is now the size of the non zero sub matrix for this partition*/
/* built a sub-matrix for partition cur_part*/
sub_mat = (double **) MALLOC(sizeof(double *) * s);
@ -264,7 +334,7 @@ com_mat_t **split_com_mat(com_mat_t *com_mat, int n, int k, int *partition)
}
}
sub_com_mat = (com_mat_t *)malloc(sizeof(com_mat_t));
sub_com_mat = (com_mat_t *)MALLOC(sizeof(com_mat_t));
sub_com_mat -> n = s;
sub_com_mat -> comm = sub_mat;
@ -275,7 +345,7 @@ com_mat_t **split_com_mat(com_mat_t *com_mat, int n, int k, int *partition)
res[cur_part] = sub_com_mat;
}
FREE(perm);
FREE(perm);
return res;
}
@ -311,7 +381,7 @@ int **split_vertices( int *vertices, int n, int k, int *partition)
return res;
}
void FREE_tab_com_mat(com_mat_t **mat,int k)
void free_tab_com_mat(com_mat_t **mat,int k)
{
int i,j;
if( !mat )
@ -321,11 +391,13 @@ void FREE_tab_com_mat(com_mat_t **mat,int k)
for ( j = 0 ; j < mat[i]->n ; j ++)
FREE( mat[i]->comm[j] );
FREE( mat[i]->comm );
FREE(mat[i]);
}
FREE(mat);
}
void FREE_tab_local_vertices(int **mat, int k)
void free_tab_local_vertices(int **mat, int k)
{
int i; /* m=n/k; */
if( !mat )
@ -338,7 +410,7 @@ void FREE_tab_local_vertices(int **mat, int k)
}
void FREE_const_tab(constraint_t *const_tab, int k)
void free_const_tab(constraint_t *const_tab, int k)
{
int i;
@ -353,19 +425,33 @@ void FREE_const_tab(constraint_t *const_tab, int k)
FREE(const_tab);
}
void kpartition_build_level_topology(tree_t *cur_node, com_mat_t *com_mat, int N, int depth,
/* Sanity check a communication matrix: every entry must be non-negative.
 * On the first negative entry, report its coordinates and value, then
 * abort the run with exit(-1).  Debug helper; has no effect on a valid
 * matrix. */
void check_com_mat(com_mat_t *com_mat){
  int i,j;

  for( i = 0 ; i < com_mat->n ; i++ )
    for( j = 0 ; j < com_mat->n ; j++ )
      if(com_mat->comm[i][j]<0){
        /* Diagnostics go to stderr (not stdout), matching the error
           reporting convention used throughout this file. */
        fprintf(stderr,"com_mat->comm[%d][%d]= %f\n",i,j,com_mat->comm[i][j]);
        exit(-1);
      }
}
void kpartition_build_level_topology(tm_tree_t *cur_node, com_mat_t *com_mat, int N, int depth,
tm_topology_t *topology, int *local_vertices,
int *constraints, int nb_constraints,
double *obj_weight, double *comm_speed)
{
com_mat_t **tab_com_mat = NULL; /* table of comunication matrix. We will have k of such comunication matrix, one for each subtree */
int k = topology->arity[depth];
tree_t **tab_child = NULL;
tm_tree_t **tab_child = NULL;
int *partition = NULL;
int **tab_local_vertices = NULL;
constraint_t *const_tab = NULL;
int i;
verbose_level = get_verbose_level();
verbose_level = tm_get_verbose_level();
/* if we are at the bottom of the tree set cur_node
and return*/
@ -377,8 +463,14 @@ void kpartition_build_level_topology(tree_t *cur_node, com_mat_t *com_mat, int N
}
if(verbose_level >= DEBUG){
printf("Partitionning Matrix of size %d (problem size= %d) in %d partitions\n", com_mat->n, N, k);
}
/* check_com_mat(com_mat); */
/* partition the com_matrix in k partitions*/
partition = kpartition(topology->arity[depth], com_mat, N, constraints, nb_constraints);
partition = kpartition(k, com_mat, N, constraints, nb_constraints);
/* split the communication matrix in k parts according to the partition just found above */
tab_com_mat = split_com_mat( com_mat, N, k, partition);
@ -387,12 +479,12 @@ void kpartition_build_level_topology(tree_t *cur_node, com_mat_t *com_mat, int N
tab_local_vertices = split_vertices( local_vertices, N, k, partition);
/* construct a tab of constraints of size k: one for each partitions*/
const_tab = split_constraints (constraints, nb_constraints, k, topology, depth);
const_tab = split_constraints (constraints, nb_constraints, k, topology, depth, N);
/* create the table of k nodes of the resulting sub-tree */
tab_child = (tree_t **) CALLOC (k,sizeof(tree_t*));
tab_child = (tm_tree_t **) CALLOC (k,sizeof(tm_tree_t*));
for( i = 0 ; i < k ; i++){
tab_child[i] = (tree_t *) MALLOC(sizeof(tree_t));
tab_child[i] = (tm_tree_t *) MALLOC(sizeof(tm_tree_t));
}
/* for each child, proceeed recursively*/
@ -408,28 +500,30 @@ void kpartition_build_level_topology(tree_t *cur_node, com_mat_t *com_mat, int N
/* link the node with its child */
set_node( cur_node, tab_child, k, NULL, cur_node->id, 0, NULL, depth);
/* FREE local data*/
/* free local data*/
FREE(partition);
FREE_tab_com_mat(tab_com_mat,k);
FREE_tab_local_vertices(tab_local_vertices,k);
FREE_const_tab(const_tab,k);
free_tab_com_mat(tab_com_mat,k);
free_tab_local_vertices(tab_local_vertices,k);
free_const_tab(const_tab,k);
}
tree_t *kpartition_build_tree_from_topology(tm_topology_t *topology,double **comm,int N, int *constraints, int nb_constraints, double *obj_weight, double *com_speed)
tm_tree_t *kpartition_build_tree_from_topology(tm_topology_t *topology,double **comm,int N, int *constraints, int nb_constraints, double *obj_weight, double *com_speed)
{
int depth,i, K;
tree_t *root = NULL;
tm_tree_t *root = NULL;
int *local_vertices = NULL;
int nb_cores;
com_mat_t com_mat;
verbose_level = get_verbose_level();
verbose_level = tm_get_verbose_level();
nb_cores=nb_processing_units(topology)*topology->oversub_fact;
if(verbose_level>=INFO)
printf("Number of constraints: %d, N=%d\n", nb_constraints, N);
nb_cores=nb_processing_units(topology);
printf("Number of constraints: %d, N=%d, nb_cores = %d, K=%d\n", nb_constraints, N, nb_cores, nb_cores-N);
if((constraints == NULL) && (nb_constraints != 0)){
if(verbose_level>=ERROR)
@ -449,7 +543,6 @@ tree_t *kpartition_build_tree_from_topology(tm_topology_t *topology,double **com
if((K=nb_cores - N)>0){
/* add K element to the object weight*/
complete_obj_weight(&obj_weight,N,K);
/* display_tab(tab,N+K);*/
} else if( K < 0){
if(verbose_level>=ERROR)
fprintf(stderr,"Not enough cores!\n");
@ -463,7 +556,7 @@ tree_t *kpartition_build_tree_from_topology(tm_topology_t *topology,double **com
local_vertices is the array of vertices that can be used
the min(N,nb_contraints) 1st element are number from 0 to N
the last ones have value -1
the value of this array will be used to number the leaves of the tree_t tree
the value of this array will be used to number the leaves of the tm_tree_t tree
that start at "root"
min(N,nb_contraints) is used to takle the case where thre is less processes than constraints
@ -479,18 +572,20 @@ tree_t *kpartition_build_tree_from_topology(tm_topology_t *topology,double **com
/* we assume all objects have the same arity*/
/* assign the root of the tree*/
root = (tree_t*) MALLOC (sizeof(tree_t));
root->id = 0;
root = (tm_tree_t*) MALLOC (sizeof(tm_tree_t));
root -> id = 0;
/*build the tree downward from the root*/
kpartition_build_level_topology(root, &com_mat, N+K, depth, topology, local_vertices,
constraints, nb_constraints, obj_weight, com_speed);
constraints, nb_constraints, obj_weight, com_speed);
/*print_1D_tab(local_vertices,K+N);*/
if(verbose_level>=INFO)
printf("Build (bottom-up) tree done!\n");
FREE(local_vertices);

Просмотреть файл

@ -1,9 +1,9 @@
typedef struct _com_mat_t{
double **comm;
double **comm;
int n; /*comm is of size n by n the other element are zeroes*/
} com_mat_t;
int *kpartition(int, com_mat_t*, int, int *, int);
tree_t * kpartition_build_tree_from_topology(tm_topology_t *topology,double **com_mat,int N, int *constraints, int nb_constraints, double *obj_weight, double *com_speed);
tm_tree_t * kpartition_build_tree_from_topology(tm_topology_t *topology,double **com_mat,int N, int *constraints, int nb_constraints, double *obj_weight, double *com_speed);

Просмотреть файл

@ -1,35 +1,60 @@
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <stdint.h>
#include "uthash.h"
#include <stdio.h>
#include "tm_verbose.h"
#include "tm_malloc.h"
#include "opal/util/alfg.h"
#include "tm_tree.h"
#include "tm_mt.h"
#define MIN(a,b) ((a)<(b)?(a):(b))
#define EXTRA_BYTE 100
typedef signed char byte;
typedef uint8_t byte;
/* static int verbose_level = ERROR;*/
typedef struct _hash_t {
void *key; /* we'll use this field as the key */
size_t size;
UT_hash_handle hh; /* makes this structure hashable */
void *key; /* we'll use this field as the key */
size_t size;
char *file;
int line;
UT_hash_handle hh; /* makes this structure hashable */
}hash_t;
static hash_t *size_hash = NULL;
static char extra_data[EXTRA_BYTE];
static void save_size(void *ptr, size_t size);
static void save_ptr(void *ptr, size_t size, char *file, int line);
static size_t retreive_size(void *someaddr);
static void init_extra_data(void);
void save_size(void *ptr, size_t size) {
/* Portable strdup() replacement (strdup is POSIX, not ISO C).
 * Returns a freshly malloc'ed copy of `string` including its trailing
 * '\0', or NULL when the allocation fails.  The caller owns the copy.
 * Fix: the length is kept in a size_t — the original stored strlen()'s
 * size_t result in an int, which truncates for very large strings. */
char *my_strdup(char* string){
  size_t size = 1 + strlen(string); /* +1 for the terminating '\0' */
  char *res = (char*)malloc(size);
  if(res)
    memcpy(res, string, size);
  return res;
}
void save_ptr(void *ptr, size_t size, char *file, int line) {
hash_t *elem;
elem = (hash_t*) malloc(sizeof(hash_t));
elem -> key = ptr;
elem -> key = ptr;
elem -> size = size;
if(get_verbose_level() >= DEBUG)
elem -> line = line;
elem -> file = my_strdup(file);
if(tm_get_verbose_level() >= DEBUG)
printf("Storing (%p,%ld)\n",ptr,size);
HASH_ADD_PTR( size_hash, key, elem );
}
@ -40,72 +65,76 @@ size_t retreive_size(void *someaddr){
hash_t *elem = NULL;
HASH_FIND_PTR(size_hash, &someaddr, elem);
if(!elem){
fprintf(stderr,"cannot find ptr %p to free!\n",someaddr);
if(tm_get_verbose_level() >= CRITICAL)
fprintf(stderr,"Cannot find ptr %p to free!\n",someaddr);
abort();
return 0;
}
res = elem->size;
if(get_verbose_level()>=DEBUG)
if(tm_get_verbose_level()>=DEBUG)
printf("Retreiving (%p,%ld)\n",someaddr, res);
free(elem->file);
HASH_DEL( size_hash, elem);
return res;
}
void my_mem_check(void){
void tm_mem_check(void){
#ifdef __DEBUG_TM_MALLOC__
hash_t *s;
int nb_errors = 0;
for(s=size_hash; s != NULL; s=s->hh.next) {
if(get_verbose_level() >= ERROR) {
printf("pointer %p of size %ld has not been freed!\n", s->key, s->size);
}
nb_errors ++;
if(tm_get_verbose_level()>=ERROR)
printf("pointer %p of size %ld (%s: %d) has not been freed!\n", s->key, s->size, s->file, s->line);
nb_errors ++;
}
if(get_verbose_level() >= INFO)
if(tm_get_verbose_level() >= INFO)
printf ("Number of errors in managing memory: %d\n",nb_errors);
#endif
}
void init_extra_data(void){
static int done = 0;
opal_rng_buff_t rng;
int i;
if(done)
return;
opal_srand(&rng,0);
init_genrand(0);
for( i = 0 ; i < EXTRA_BYTE; i++)
extra_data[i] = (char) opal_rand(&rng) % 256;
extra_data[i] = (char) genrand_int32() % 256;
done = 1;
}
void *my_malloc(size_t size, char *file, int line){
void *tm_malloc(size_t size, char *file, int line){
byte *ptr;
init_extra_data();
size+=2*EXTRA_BYTE;
ptr = malloc(size);
if(get_verbose_level()>=DEBUG)
printf("my_malloc of size %ld: %p (%s: %d)\n",size-2*EXTRA_BYTE,(void*)ptr,file,line);
if(tm_get_verbose_level()>=DEBUG)
printf("tm_malloc of size %ld: %p (%s: %d)\n",size-2*EXTRA_BYTE,ptr,file,line);
save_size(ptr,size);
save_ptr(ptr, size, file, line);
memcpy(ptr, extra_data, EXTRA_BYTE);
memcpy(ptr + size - EXTRA_BYTE, extra_data, EXTRA_BYTE);
if(get_verbose_level()>=DEBUG)
printf("my_malloc returning: %p\n",(void*)(ptr+EXTRA_BYTE));
if(tm_get_verbose_level()>=DEBUG)
printf("tm_malloc returning: %p\n",ptr+EXTRA_BYTE);
return (void *)(ptr + EXTRA_BYTE);
}
void *my_calloc(size_t count, size_t size, char *file, int line){
void *tm_calloc(size_t count, size_t size, char *file, int line){
byte *ptr;
size_t full_size;
@ -115,22 +144,72 @@ void *my_calloc(size_t count, size_t size, char *file, int line){
ptr = malloc(full_size);
bzero(ptr,full_size);
save_size(ptr, full_size);
save_ptr(ptr, full_size, file, line);
if(get_verbose_level()>=DEBUG)
printf("my_calloc of size %ld: %p (%s: %d)\n",full_size-2*EXTRA_BYTE,(void*)ptr, file, line);
if(tm_get_verbose_level()>=DEBUG)
printf("tm_calloc of size %ld: %p (%s: %d)\n",full_size-2*EXTRA_BYTE,ptr, file, line);
memcpy(ptr, extra_data, EXTRA_BYTE);
memcpy(ptr + full_size - EXTRA_BYTE, extra_data, EXTRA_BYTE);
if(get_verbose_level()>=DEBUG)
printf("my_calloc returning: %p\n",(void*)(ptr+EXTRA_BYTE));
if(tm_get_verbose_level()>=DEBUG)
printf("tm_calloc returning: %p\n",ptr+EXTRA_BYTE);
return (void *)(ptr+EXTRA_BYTE);
}
void my_free(void *ptr){
void *tm_realloc(void *old_ptr, size_t size, char *file, int line){
byte *ptr;
size_t full_size;
init_extra_data();
full_size = size + 2 * EXTRA_BYTE;
ptr = malloc(full_size);
save_ptr(ptr, full_size, file, line);
if(tm_get_verbose_level()>=DEBUG)
printf("tm_realloc of size %ld: %p (%s: %d)\n",full_size-2*EXTRA_BYTE,ptr, file, line);
memcpy(ptr, extra_data, EXTRA_BYTE);
memcpy(ptr + full_size - EXTRA_BYTE, extra_data, EXTRA_BYTE);
if(old_ptr){
byte *original_ptr = ((byte *)old_ptr) - EXTRA_BYTE;
size_t old_ptr_size = retreive_size(original_ptr);
memcpy(ptr + EXTRA_BYTE, old_ptr, MIN(old_ptr_size - 2 * EXTRA_BYTE, size));
if((bcmp(original_ptr ,extra_data, EXTRA_BYTE)) && ((tm_get_verbose_level()>=ERROR))){
fprintf(stderr,"Realloc: cannot find special string ***before*** %p!\n", original_ptr);
fprintf(stderr,"memory is probably corrupted here!\n");
}
if((bcmp(original_ptr + old_ptr_size -EXTRA_BYTE ,extra_data, EXTRA_BYTE)) && ((tm_get_verbose_level()>=ERROR))){
fprintf(stderr,"Realloc: cannot find special string ***after*** %p!\n", original_ptr);
fprintf(stderr,"memory is probably corrupted here!\n");
}
if(tm_get_verbose_level()>=DEBUG)
printf("tm_free freeing: %p\n",original_ptr);
free(original_ptr);
}
if(tm_get_verbose_level()>=DEBUG)
printf("tm_realloc returning: %p (----- %p)\n",ptr+EXTRA_BYTE, ((byte *)ptr) - EXTRA_BYTE);
return (void *)(ptr+EXTRA_BYTE);
}
void tm_free(void *ptr){
byte *original_ptr = ((byte *)ptr) - EXTRA_BYTE;
size_t size;
@ -139,18 +218,18 @@ void my_free(void *ptr){
size = retreive_size(original_ptr);
if((bcmp(original_ptr ,extra_data, EXTRA_BYTE)) && ((get_verbose_level()>=ERROR))){
fprintf(stderr,"cannot find special string ***before*** %p!\n",ptr);
if((bcmp(original_ptr ,extra_data, EXTRA_BYTE)) && ((tm_get_verbose_level()>=ERROR))){
fprintf(stderr,"Free: cannot find special string ***before*** %p!\n", original_ptr);
fprintf(stderr,"memory is probably corrupted here!\n");
}
if((bcmp(original_ptr + size -EXTRA_BYTE ,extra_data, EXTRA_BYTE)) && ((get_verbose_level()>=ERROR))){
fprintf(stderr,"cannot find special string ***after*** %p!\n",ptr);
if((bcmp(original_ptr + size -EXTRA_BYTE ,extra_data, EXTRA_BYTE)) && ((tm_get_verbose_level()>=ERROR))){
fprintf(stderr,"Free: cannot find special string ***after*** %p!\n", original_ptr);
fprintf(stderr,"memory is probably corrupted here!\n");
}
if(get_verbose_level()>=DEBUG)
printf("my_free freeing: %p\n",(void*)original_ptr);
if(tm_get_verbose_level()>=DEBUG)
printf("tm_free freeing: %p\n",original_ptr);
free(original_ptr);

Просмотреть файл

@ -1,5 +1,29 @@
#ifndef _TM_MALLOC_H_
#define _TM_MALLOC_H_
#include <stdlib.h>
void *my_malloc(size_t size, char *, int);
void *my_calloc(size_t count, size_t size, char *, int);
void my_free(void *ptr);
void my_mem_check(void);
void *tm_malloc(size_t size, char *, int);
void *tm_calloc(size_t count, size_t size, char *, int);
void *tm_realloc(void *ptr, size_t size, char *, int);
void tm_free(void *ptr);
void tm_mem_check(void);
/* for debugging malloc */
/* #define __DEBUG_TM_MALLOC__ */
#undef __DEBUG_TM_MALLOC__
#ifdef __DEBUG_TM_MALLOC__
#define MALLOC(x) tm_malloc(x,__FILE__,__LINE__)
#define CALLOC(x,y) tm_calloc(x,y,__FILE__,__LINE__)
#define REALLOC(x,y) tm_realloc(x,y,__FILE__,__LINE__)
#define FREE tm_free
#define MEM_CHECK tm_mem_check
#else
#define MALLOC malloc
#define CALLOC calloc
#define FREE free
#define REALLOC realloc
#define MEM_CHECK tm_mem_check
#endif
#endif

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,43 +1,34 @@
#ifndef __TM_MAPPING_H__
#define __TM_MAPPING_H__
#include "tm_tree.h"
#include "tm_hwloc.h"
#include "tm_topology.h"
#include "tm_timings.h"
#include "tm_verbose.h"
int build_comm(char *filename,double ***pcomm);
void TreeMatchMapping(int nb_obj, int nb_proc,double **comm_mat, double * obj_weigth, double *com_speed, int d, int *sol);
/*Map topology to cores:
sigma_i is such that process i is mapped on core sigma_i
k_i is such that core i exectutes process k_i
size of sigma is the number of process (nb_objs)
size of k is the number of cores/nodes (nb_proc)
We must have numbe of process<=number of cores
k_i =-1 if no process is mapped on core i
*/
void map_topology_simple(tm_topology_t *topology,tree_t *comm_tree, int *sigma, int nb_processes, int *k);
int nb_processing_units(tm_topology_t *topology);
void free_topology(tm_topology_t *topology);
void display_other_heuristics(tm_topology_t *topology,int N,double **comm,int TGT_flag, int *constraints, double *cost);
void print_1D_tab(int *tab,int N);
tm_affinity_mat_t * new_affinity_mat(double **mat, double *sum_row, int order);
void build_synthetic_proc_id(tm_topology_t *topology);
void display_topology(tm_topology_t *topology);
tm_topology_t *build_synthetic_topology(int *arity, int nb_levels, int *core_numbering, int nb_core_per_node);
tm_topology_t *optimize_topology(tm_topology_t *topology);
double print_sol_inv(int N,int *Value,double **comm, double *cost, tm_topology_t *topology);
double print_sol(int N,int *Value,double **comm, double *cost, tm_topology_t *topology);
int build_binding_constraints(char *filename, int **ptab);
void canonize_constraints(tm_topology_t *topology, int *constraints, int **canonical, int n, int **perm, int *m);
tm_topology_t *build_synthetic_topology(int *arity, int nb_levels, int *core_numbering, int nb_core_per_nodes);
int compute_nb_leaves_from_level(int depth,tm_topology_t *topology);
void FREE_topology(tm_topology_t *);
void depth_first(tm_tree_t *comm_tree, int *proc_list,int *i);
int fill_tab(int **new_tab,int *tab, int n, int start, int max_val, int shift);
void init_mat(char *filename,int N, double **mat, double *sum_row);
void map_topology(tm_topology_t *topology,tm_tree_t *comm_tree, int level,
int *sigma, int nb_processes, int **k, int nb_compute_units);
int nb_leaves(tm_tree_t *comm_tree);
int nb_lines(char *filename);
int nb_processing_units(tm_topology_t *topology);
void print_1D_tab(int *tab,int N);
tm_solution_t * tm_compute_mapping(tm_topology_t *topology,tm_tree_t *comm_tree);
void tm_finalize();
void tm_free_affinity_mat(tm_affinity_mat_t *aff_mat);
tm_affinity_mat_t *tm_load_aff_mat(char *filename);
void update_comm_speed(double **comm_speed,int old_size,int new_size);
/* use to split a constaint into subconstraint according the tree*/
typedef struct _constraint{
typedef struct{
int *constraints; /* the subconstraints*/
int length; /*length of *constraints*/
int id; /* id of the corresponding subtree*/
}constraint_t;
#endif

Просмотреть файл

@ -2,8 +2,7 @@ void init_genrand(unsigned long s);
void init_by_array(unsigned long init_key[], int key_length);
/* generates a random number on the interval [0,0x7fffffff] */
unsigned long genrand_int32(void);
unsigned long genrand_int32(void);
long genrand_int31(void);
double genrand_real1(void);
double genrand_real2(void);

Просмотреть файл

@ -0,0 +1,525 @@
#include <ctype.h>
#include <float.h>
#include "tm_solution.h"
#include "tm_mt.h"
#include "tm_mapping.h"
typedef struct {
int val;
long key;
} hash_t;
void tm_free_solution(tm_solution_t *sol);
int distance(tm_topology_t *topology,int i, int j);
double display_sol_sum_com(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma);
double display_sol(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma, tm_metric_t metric);
double tm_display_solution(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_solution_t *sol,
tm_metric_t metric);
void tm_display_other_heuristics(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_metric_t metric);
int in_tab(int *tab, int n, int val);
void map_Packed(tm_topology_t *topology, int N, int *sigma);
void map_RR(tm_topology_t * topology, int N, int *sigma);
int hash_asc(const void* x1,const void* x2);
int *generate_random_sol(tm_topology_t *topology,int N,int level,int seed);
double eval_sol(int *sol,int N,double **comm, double **arch);
void exchange(int *sol,int i,int j);
double gain_exchange(int *sol,int l,int m,double eval1,int N,double **comm, double **arch);
void select_max(int *l,int *m,double **gain,int N,int *state);
void compute_gain(int *sol,int N,double **gain,double **comm, double **arch);
void map_MPIPP(tm_topology_t *topology,int nb_seed,int N,int *sigma,double **comm, double **arch);
/* Release a solution: every row of the k table, the table itself,
   the sigma array, and finally the solution structure. */
void tm_free_solution(tm_solution_t *sol){
  int i;
  int nb_units = sol->k_length;

  if(sol->k)
    for(i = 0 ; i < nb_units ; i++)
      FREE(sol->k[i]);

  FREE(sol->k);
  FREE(sol->sigma);
  FREE(sol);
}
/*
Compute the distance in the tree
between node i and j : the farther away node i and j, the
larger the returned value.
The algorithm looks at the largest level, starting from the top,
for which node i and j are still in the same subtree. This is done
by iteratively dividing their numbering by the arity of the levels
*/
int distance(tm_topology_t *topology,int i, int j)
{
  int level = 0;
  int arity;
  int f_i, f_j ;
  int vl = tm_get_verbose_level();
  int depth = topology->nb_levels-1;

  /* Start from the ranks of i and j at the deepest level (the leaves). */
  f_i = topology->node_rank[depth][i];
  f_j = topology->node_rank[depth][j];

  if(vl >= DEBUG)
    printf("i=%d, j=%d Level = %d f=(%d,%d)\n",i ,j, level, f_i, f_j);

  do{
    /* Climb one level: dividing a rank by the arity of the level gives
       the rank of the enclosing subtree.  The loop stops at the first
       level where both ranks coincide (common ancestor) or at the root. */
    level++;
    arity = topology->arity[level];
    /* an arity of 0 marks a leaf level: use 1 so the division is a no-op */
    if( arity == 0 )
      arity = 1;
    f_i = f_i/arity;
    f_j = f_j/arity;
  } while((f_i!=f_j) && (level < depth));

  if(vl >= DEBUG)
    printf("distance(%d,%d):%d\n",topology->node_rank[depth][i], topology->node_rank[depth][j], level);
  /* exit(-1); */
  return level;
}
/* Print sigma and return the SUM_COM metric: the sum over all process
   pairs of (affinity weight) * (cost of the tree level where they meet). */
double display_sol_sum_com(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma)
{
  double **mat = aff_mat->mat;
  double *cost = topology->cost;
  int N        = aff_mat->order;
  int depth    = topology->nb_levels - 1;
  double total = 0;
  int i,j;

  for( i = 0 ; i < N ; i++ ){
    for( j = i+1 ; j < N ; j++ ){
      double w = mat[i][j];
      /* The cost table is indexed from the root down: cost[0] is the
         cost of the longest distance, hence the depth-distance inversion. */
      double d = cost[depth-distance(topology,sigma[i],sigma[j])];
      if(tm_get_verbose_level() >= DEBUG)
        printf("T_%d_%d %f*%f=%f\n",i,j,w,d,w*d);
      total += w*d;
    }
  }

  for( i = 0 ; i < N ; i++ ){
    printf("%d", sigma[i]);
    if(i < N-1)
      printf(",");
  }
  printf(" : %g\n",total);

  return total;
}
/* Print sigma and return the MAX_COM metric: the largest value of
   (affinity weight) * (cost of the tree level where they meet)
   over all process pairs. */
double display_sol_max_com(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma)
{
  double **mat = aff_mat->mat;
  double *cost = topology->cost;
  int N        = aff_mat->order;
  int depth    = topology->nb_levels - 1;
  int vl       = tm_get_verbose_level();
  double worst = 0;
  int i,j;

  for( i = 0 ; i < N ; i++ ){
    for( j = i+1 ; j < N ; j++ ){
      double w = mat[i][j];
      /* cost[0] is the cost of the longest distance, hence the inversion */
      double d = cost[depth-distance(topology,sigma[i],sigma[j])];
      if(vl >= DEBUG)
        printf("T_%d_%d %f*%f=%f\n",i,j,w,d,w*d);
      if(w*d > worst)
        worst = w*d;
    }
  }

  for( i = 0 ; i < N ; i++ ){
    printf("%d", sigma[i]);
    if(i < N-1)
      printf(",");
  }
  printf(" : %g\n",worst);

  return worst;
}
/* Print sigma and return the HOP_BYTE metric: the sum over all process
   pairs of (bytes exchanged) * (hop count).  The hop count is twice the
   tree distance — presumably one traversal up and one down; confirm. */
double display_sol_hop_byte(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma)
{
  double **mat = aff_mat->mat;
  int N        = aff_mat->order;
  double total = 0;
  int i,j;

  for( i = 0 ; i < N ; i++ ){
    for( j = i+1 ; j < N ; j++ ){
      double bytes = mat[i][j];
      int hops     = 2*distance(topology,sigma[i],sigma[j]);
      if(tm_get_verbose_level() >= DEBUG)
        printf("T_%d_%d %f*%d=%f\n",i,j,bytes,hops,bytes*hops);
      total += bytes*hops;
    }
  }

  for( i = 0 ; i < N ; i++ ){
    printf("%d", sigma[i]);
    if(i < N-1)
      printf(",");
  }
  printf(" : %g\n",total);

  return total;
}
/* Dispatch to the metric-specific display routine.
   Returns the metric value, or -1 for an unknown metric. */
double display_sol(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma, tm_metric_t metric){
  switch (metric){
  case TM_METRIC_SUM_COM:
    return display_sol_sum_com(topology, aff_mat, sigma);
  case TM_METRIC_MAX_COM:
    return display_sol_max_com(topology, aff_mat, sigma);
  case TM_METRIC_HOP_BYTE:
    return display_sol_hop_byte(topology, aff_mat, sigma);
  default:
    if(tm_get_verbose_level() >= ERROR)
      fprintf(stderr,"Error printing solution: metric %d not implemented\n",metric);
    return -1;
  }
}
/* Display the solution for the given topology and affinity matrix and
   return its metric value.  In DEBUG mode also dump the k table: for
   each processing unit, the processes mapped onto it (at most
   oversub_fact entries, terminated by -1). */
double tm_display_solution(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_solution_t *sol,
                           tm_metric_t metric){
  int i,j;
  int **k = sol->k;

  if(tm_get_verbose_level() >= DEBUG){
    printf("k: \n");
    for( i = 0 ; i < nb_processing_units(topology) ; i++ ){
      /* k[i][0] == -1 means no process is mapped on unit i */
      if(k[i][0] != -1){
        printf("\tProcessing unit %d: ",i);
        for (j = 0 ; j<topology->oversub_fact; j++){
          if( k[i][j] == -1)  /* -1 terminates the list for this unit */
            break;
          printf("%d ",k[i][j]);
        }
        printf("\n");
      }
    }
  }
  return display_sol(topology, aff_mat, sol->sigma, metric);
}
/* Compute and print the placements produced by the baseline heuristics
   (Packed and Round-Robin) for the given affinity matrix, so they can be
   compared against the TreeMatch solution. */
void tm_display_other_heuristics(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_metric_t metric)
{
  int N = aff_mat->order;
  int *sigma = (int*)MALLOC(sizeof(int)*N);

  map_Packed(topology, N, sigma);
  printf("Packed: ");
  display_sol(topology, aff_mat, sigma, metric);

  map_RR(topology, N, sigma);
  printf("RR: ");
  display_sol(topology, aff_mat, sigma, metric);

  /* The timed MPIPP-1/MPIPP-5 comparisons (map_MPIPP) that used to run
     here are disabled; see the revision history to restore them. */

  FREE(sigma);
}
/* Return 1 if val appears among the first n entries of tab, 0 otherwise. */
int in_tab(int *tab, int n, int val){
  int idx;
  for( idx = 0 ; idx < n ; idx++ ){
    if(tab[idx] != val)
      continue;
    return 1;
  }
  return 0;
}
/* Packed placement: fill sigma with the first N leaf node ids, in order,
   keeping only ids allowed by the binding constraints when present. */
void map_Packed(tm_topology_t *topology, int N, int *sigma)
{
  size_t i;
  int j = 0,depth;
  int vl = tm_get_verbose_level();

  depth = topology->nb_levels-1;

  for( i = 0 ; i < topology->nb_nodes[depth] ; i++){
    /* printf ("%d -> %d\n",objs[i]->os_index,i); */
    /* skip leaves excluded by the constraint list (if any) */
    if((!topology->constraints) || (in_tab(topology->constraints, topology->nb_constraints, topology->node_id[depth][i]))){
      if(vl >= DEBUG)
        printf ("%lu: %d -> %d\n", i, j, topology->node_id[depth][i]);
      sigma[j++]=topology->node_id[depth][i];
      if(j == N)  /* all N processes have been placed */
        break;
    }
  }
}
/* Round-robin placement: process i goes to processing unit i modulo the
   number of units, or cycles through the constraint list when one is set. */
void map_RR(tm_topology_t *topology, int N,int *sigma)
{
  int vl = tm_get_verbose_level();
  int rank;

  for( rank = 0 ; rank < N ; rank++ ){
    if(topology->constraints)
      sigma[rank] = topology->constraints[rank % topology->nb_constraints];
    else
      sigma[rank] = rank % topology->nb_proc_units;

    if(vl >= DEBUG)
      printf ("%d -> %d (%d)\n",rank,sigma[rank],topology->nb_proc_units);
  }
}
/* qsort comparator: order hash_t entries by increasing key.
   The previous "(a < b) ? -1 : 1" form returned 1 for equal keys in both
   directions, which violates the consistent total order qsort requires
   (C11 7.22.5); return 0 for equal keys instead. */
int hash_asc(const void* x1,const void* x2)
{
  const hash_t *e1 = (const hash_t*)x1;
  const hash_t *e2 = (const hash_t*)x2;

  if(e1->key < e2->key)
    return -1;
  if(e1->key > e2->key)
    return 1;
  return 0;
}
/* Build a random permutation of the node ids of the given topology level:
   key each id with a Mersenne-Twister draw seeded with `seed`, then sort
   on the keys.  The caller owns (and FREEs) the returned array. */
int *generate_random_sol(tm_topology_t *topology,int N,int level,int seed)
{
  int *nodes_id   = topology->node_id[level];
  hash_t *entries = (hash_t*)MALLOC(sizeof(hash_t)*N);
  int *perm       = (int*)MALLOC(sizeof(int)*N);
  int i;

  init_genrand(seed);

  for( i = 0 ; i < N ; i++ ){
    entries[i].val = nodes_id[i];
    entries[i].key = genrand_int32();
  }

  qsort(entries,N,sizeof(hash_t),hash_asc);

  for( i = 0 ; i < N ; i++ )
    perm[i] = entries[i].val;

  FREE(entries);
  return perm;
}
double eval_sol(int *sol,int N,double **comm, double **arch)
{
double a,c,res;
int i,j;
res = 0;
for ( i = 0 ; i < N ; i++ )
for ( j = i+1 ; j < N ; j++ ){
c = comm[i][j];
a = arch[sol[i]][sol[j]];
res += c/a;
}
return res;
}
/* Swap entries i and j of the placement array. */
void exchange(int *sol,int i,int j)
{
  int saved = sol[i];
  sol[i] = sol[j];
  sol[j] = saved;
}
/* Objective improvement obtained by swapping entries l and m of sol,
   relative to the known current value eval1 (positive = improvement).
   The swap is undone before returning, so sol is left unchanged. */
double gain_exchange(int *sol,int l,int m,double eval1,int N,double **comm, double **arch)
{
  double swapped_eval;

  if( l == m )
    return 0;

  exchange(sol,l,m);
  swapped_eval = eval_sol(sol,N,comm,arch);
  exchange(sol,l,m);

  return eval1 - swapped_eval;
}
/* Find the pair (l,m) of distinct, non-frozen indices (state[.] == 0)
   with the largest gain and store it in *l and *m.  If every pair is
   frozen, *l and *m are left untouched. */
void select_max(int *l,int *m,double **gain,int N,int *state)
{
  double best = -DBL_MAX;
  int i,j;

  for( i = 0 ; i < N ; i++ ){
    if(state[i])
      continue;
    for( j = 0 ; j < N ; j++ ){
      if( (j == i) || state[j] )
        continue;
      if(gain[i][j] > best){
        best = gain[i][j];
        *l = i;
        *m = j;
      }
    }
  }
}
/* Fill the symmetric gain matrix: gain[i][j] is the objective improvement
   obtained by swapping entries i and j of sol. */
void compute_gain(int *sol,int N,double **gain,double **comm, double **arch)
{
  double current = eval_sol(sol,N,comm,arch);
  int i,j;

  for( i = 0 ; i < N ; i++ ){
    for( j = 0 ; j <= i ; j++ ){
      double g = gain_exchange(sol,i,j,current,N,comm,arch);
      gain[i][j] = g;
      gain[j][i] = g;
    }
  }
}
/* Randomized Algorithm of
Hu Chen, Wenguang Chen, Jian Huang ,Bob Robert,and H.Kuhn. Mpipp: an automatic profile-guided
parallel process placement toolset for smp clusters and multiclusters. In
Gregory K. Egan and Yoichi Muraoka, editors, ICS, pages 353-360. ACM, 2006.
*/
void map_MPIPP(tm_topology_t *topology,int nb_seed,int N,int *sigma,double **comm, double **arch)
{
  int *sol = NULL;       /* current working placement */
  int *state = NULL;     /* state[i]==1: index i frozen for the current pass */
  double **gain = NULL;  /* gain[i][j]: objective gain of swapping i and j */
  int **history = NULL;  /* history[s][1..2]: the pair swapped at step s */
  double *temp = NULL;   /* temp[s]: gain recorded at step s */
  int i,j,t,l=0,m=0,seed=0;
  double max,sum,best_eval,eval;

  gain = (double**)MALLOC(sizeof(double*)*N);
  history = (int**)MALLOC(sizeof(int*)*N);
  for( i = 0 ; i < N ; i++){
    gain[i] = (double*)MALLOC(sizeof(double)*N);
    history[i] = (int*)MALLOC(sizeof(int)*3);
  }
  state = (int*)MALLOC(sizeof(int)*N);
  temp = (double*)MALLOC(sizeof(double)*N);

  /* start from a random placement; best_eval tracks the best seen so far */
  sol = generate_random_sol(topology,N,topology->nb_levels-1,seed++);
  for( i = 0 ; i < N ; i++)
    sigma[i] = sol[i];
  best_eval = DBL_MAX;

  while(seed <= nb_seed){
    do{
      for( i = 0 ; i < N ; i++ ){
        state[i] = 0;
        /* printf("%d ",sol[i]); */
      }
      /* printf("\n"); */
      compute_gain(sol,N,gain,comm,arch);
      /*
        display_tab(gain,N);
        exit(-1);
      */
      /* Greedy pass: repeatedly apply the best remaining swap, freeze both
         indices, and record the swap and its gain (Kernighan-Lin style). */
      for( i = 0 ; i < N/2 ; i++ ){
        select_max(&l,&m,gain,N,state);
        /* printf("%d: %d <=> %d : %f\n",i,l,m,gain[l][m]); */
        state[l] = 1;
        state[m] = 1;
        exchange(sol,l,m);
        history[i][1] = l;
        history[i][2] = m;
        temp[i] = gain[l][m];
        compute_gain(sol,N,gain,comm,arch);
      }

      /* find the prefix of swaps with the best cumulated gain (t = last
         index of that prefix, -1 if no prefix has positive gain) */
      t = -1;
      max = 0;
      sum = 0;
      for(i = 0 ; i < N/2 ; i++ ){
        sum += temp[i];
        if( sum > max ){
          max = sum;
          t = i;
        }
      }

      /*for(j=0;j<=t;j++)
        printf("exchanging: %d with %d for gain: %f\n",history[j][1],history[j][2],temp[j]); */
      /* undo every swap past the best prefix */
      for( j = t+1 ; j < N/2 ; j++ ){
        exchange(sol,history[j][1],history[j][2]);
        /* printf("Undoing: %d with %d for gain: %f\n",history[j][1],history[j][2],temp[j]); */
      }
      /* printf("max=%f\n",max); */
      /*for(i=0;i<N;i++){
        printf("%d ",sol[i]);
      }
      printf("\n");*/

      /* keep this pass's result if it beats the best placement so far */
      eval = eval_sol(sol,N,comm,arch);
      if(eval < best_eval){
        best_eval = eval;
        for(i = 0 ; i < N ; i++)
          sigma[i] = sol[i];
        /* print_sol(N); */
      }
    }while( max > 0 );  /* iterate while a pass still improved the objective */

    /* restart from a fresh random placement with the next seed */
    FREE(sol);
    sol=generate_random_sol(topology,N,topology->nb_levels-1,seed++);
  }

  FREE(sol);
  FREE(temp);
  FREE(state);
  for( i = 0 ; i < N ; i++){
    FREE(gain[i]);
    FREE(history[i]);
  }
  FREE(gain);
  FREE(history);
}

Просмотреть файл

@ -0,0 +1,26 @@
/* Entry points of tm_solution.c: evaluation and display of process
   placements (sigma / k tables) plus the baseline Packed, Round-Robin
   and MPIPP placement heuristics.
   NOTE(review): the guard macro is misspelled (TM_SOLUION_H, missing a T)
   — harmless since #ifndef/#define agree, but worth renaming. */
#ifndef TM_SOLUION_H
#define TM_SOLUION_H

#include "treematch.h"

void tm_free_solution(tm_solution_t *sol);
int distance(tm_topology_t *topology,int i, int j);
double display_sol_sum_com(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma);
double display_sol(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, int *sigma, tm_metric_t metric);
double tm_display_solution(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_solution_t *sol,
                           tm_metric_t metric);
void tm_display_other_heuristics(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_metric_t metric);
int in_tab(int *tab, int n, int val);
void map_Packed(tm_topology_t *topology, int N, int *sigma);
void map_RR(tm_topology_t *topology, int N, int *sigma);
int hash_asc(const void* x1,const void* x2);
int *generate_random_sol(tm_topology_t *topology,int N,int level,int seed);
double eval_sol(int *sol,int N,double **comm, double **arch);
void exchange(int *sol,int i,int j);
double gain_exchange(int *sol,int l,int m,double eval1,int N,double **comm, double **arch);
void select_max(int *l,int *m,double **gain,int N,int *state);
void compute_gain(int *sol,int N,double **gain,double **comm, double **arch);
void map_MPIPP(tm_topology_t *topology,int nb_seed,int N,int *sigma,double **comm, double **arch);

#endif

Просмотреть файл

@ -1,13 +1,18 @@
#include <pthread.h>
#include "tm_thread_pool.h"
#include "tm_verbose.h"
#include "opal/mca/hwloc/hwloc-internal.h"
#include <hwloc.h>
#include "tm_verbose.h"
#include "tm_tree.h"
#include <errno.h>
#include <limits.h>
typedef enum _mapping_policy {COMPACT, SCATTER} mapping_policy_t;
static mapping_policy_t mapping_policy = COMPACT;
static int verbose_level = ERROR;
static thread_pool_t *pool = NULL;
static unsigned int max_nb_threads = INT_MAX;
static thread_pool_t *get_thread_pool(void);
static void execute_work(work_t *work);
@ -16,39 +21,21 @@ static void *thread_loop(void *arg);
static void add_work(pthread_mutex_t *list_lock, pthread_cond_t *cond_var, work_t *working_list, work_t *work);
static thread_pool_t *create_threads(void);
static void f1 (int nb_args, void **args);
static void f2 (int nb_args, void **args);
static void f1 (int nb_args, void **args, int thread_id);
static void f2 (int nb_args, void **args, int thread_id);
static void destroy_work(work_t *work);
#define MIN(a, b) ((a)<(b)?(a):(b))
#define MAX(a, b) ((a)>(b)?(a):(b))
void f1 (int nb_args, void **args){
int a, b;
a = *(int*)args[0];
b = *(int*)args[1];
printf("nb_args=%d, a=%d, b=%d\n",nb_args,a,b);
void tm_set_max_nb_threads(unsigned int val){
max_nb_threads = val;
}
void f2 (int nb_args, void **args){
int n, *tab;
int *res;
int i,j;
n = *(int*)args[0];
tab = (int*)args[1];
res=(int*)args[2];
for(j=0;j<1000000;j++){
*res=0;
for (i=0;i<n;i++)
*res+=tab[i];
}
printf("done: %d!\n",nb_args);
}
void execute_work(work_t *work){
work->task(work->nb_args, work->args);
work->task(work->nb_args, work->args, work->thread_id);
}
int bind_myself_to_core(hwloc_topology_t topology, int id){
@ -57,10 +44,29 @@ int bind_myself_to_core(hwloc_topology_t topology, int id){
char *str;
int binding_res;
int depth = hwloc_topology_get_depth(topology);
int nb_cores = hwloc_get_nbobjs_by_depth(topology, depth-1);
int my_core;
int nb_threads = get_nb_threads();
/* printf("depth=%d\n",depth); */
switch (mapping_policy){
case SCATTER:
my_core = id*(nb_cores/nb_threads);
break;
default:
if(verbose_level>=WARNING){
printf("Wrong scheduling policy. Using COMPACT\n");
}
case COMPACT:
my_core = id%nb_cores;
}
if(verbose_level>=INFO){
printf("Mapping thread %d on core %d\n",id,my_core);
}
/* Get my core. */
obj = hwloc_get_obj_by_depth(topology, depth-1, id);
obj = hwloc_get_obj_by_depth(topology, depth-1, my_core);
if (obj) {
/* Get a copy of its cpuset that we may modify. */
cpuset = hwloc_bitmap_dup(obj->cpuset);
@ -71,7 +77,7 @@ int bind_myself_to_core(hwloc_topology_t topology, int id){
/*hwloc_bitmap_asprintf(&str, cpuset);
printf("Binding thread %d to cpuset %s\n", id,str);
printf("Binding thread %d to cpuset %s\n", my_core,str);
FREE(str);
*/
@ -81,8 +87,8 @@ int bind_myself_to_core(hwloc_topology_t topology, int id){
int error = errno;
hwloc_bitmap_asprintf(&str, obj->cpuset);
if(verbose_level>=WARNING)
fprintf(stderr,"%d Couldn't bind to cpuset %s: %s\n", id, str, strerror(error));
FREE(str);
printf("Thread %d couldn't bind to cpuset %s: %s.\n This thread is not bound to any core...\n", my_core, str, strerror(error));
free(str); /* str is allocated by hlwoc, free it normally*/
return 0;
}
/* FREE our cpuset copy */
@ -90,7 +96,7 @@ int bind_myself_to_core(hwloc_topology_t topology, int id){
return 1;
}else{
if(verbose_level>=WARNING)
fprintf(stderr,"No valid object for core id %d!\n",id);
printf("No valid object for core id %d!\n",my_core);
return 0;
}
}
@ -161,6 +167,7 @@ void wait_work_completion(work_t *work){
int submit_work(work_t *work, int thread_id){
if( (thread_id>=0) && (thread_id< pool->nb_threads)){
work->thread_id = thread_id;
add_work(&pool->list_lock[thread_id], &pool->cond_var[thread_id], &pool->working_list[thread_id], work);
return 1;
}
@ -171,11 +178,11 @@ thread_pool_t *create_threads(){
hwloc_topology_t topology;
int i;
local_thread_t *local;
int nb_cores;
int nb_threads;
unsigned int nb_cores;
int depth;
verbose_level = get_verbose_level();
verbose_level = tm_get_verbose_level();
/*Get number of cores: set 1 thread per core*/
/* Allocate and initialize topology object. */
@ -187,7 +194,7 @@ thread_pool_t *create_threads(){
depth = hwloc_topology_get_depth(topology);
if (depth == -1 ) {
if(verbose_level>=CRITICAL)
fprintf(stderr,"Error: topology with unknown depth\n");
fprintf(stderr,"Error: HWLOC unable to find the depth of the topology of this node!\n");
exit(-1);
}
@ -195,19 +202,23 @@ thread_pool_t *create_threads(){
/* at depth 'depth' it is necessary a PU/core where we can execute things*/
nb_cores = hwloc_get_nbobjs_by_depth(topology, depth-1);
nb_threads = MIN(nb_cores, max_nb_threads);
if(verbose_level>=INFO)
printf("nb_threads = %d\n",nb_threads);
pool = (thread_pool_t*) MALLOC(sizeof(thread_pool_t));
pool -> topology = topology;
pool -> nb_threads = nb_cores;
pool -> thread_list = (pthread_t*)MALLOC(sizeof(pthread_t)*nb_cores);
pool -> working_list = (work_t*)CALLOC(nb_cores,sizeof(work_t));
pool -> cond_var = (pthread_cond_t*)MALLOC(sizeof(pthread_cond_t)*nb_cores);
pool -> list_lock = (pthread_mutex_t*)MALLOC(sizeof(pthread_mutex_t)*nb_cores);
pool -> nb_threads = nb_threads;
pool -> thread_list = (pthread_t*)MALLOC(sizeof(pthread_t)*nb_threads);
pool -> working_list = (work_t*)CALLOC(nb_threads,sizeof(work_t));
pool -> cond_var = (pthread_cond_t*)MALLOC(sizeof(pthread_cond_t)*nb_threads);
pool -> list_lock = (pthread_mutex_t*)MALLOC(sizeof(pthread_mutex_t)*nb_threads);
local=(local_thread_t*)MALLOC(sizeof(local_thread_t)*nb_cores);
local=(local_thread_t*)MALLOC(sizeof(local_thread_t)*nb_threads);
pool->local = local;
for (i=0;i<nb_cores;i++){
for (i=0;i<nb_threads;i++){
local[i].topology = topology;
local[i].id = i;
local[i].working_list = &pool->working_list[i];
@ -245,11 +256,12 @@ void terminate_thread_pool(){
for (id=0;id<pool->nb_threads;id++){
pthread_join(pool->thread_list[id],(void **) &ret);
FREE(ret);
pthread_cond_destroy(pool->cond_var +id);
pthread_mutex_destroy(pool->list_lock +id);
if (pool->working_list[id].next != NULL)
if(verbose_level >= WARNING)
fprintf(stderr,"Working list of thread %d not empty!\n",id);
printf("Working list of thread %d not empty!\n",id);
}
hwloc_topology_destroy(pool->topology);
@ -272,7 +284,7 @@ int get_nb_threads(){
}
work_t *create_work(int nb_args, void **args, void (*task) (int, void **)){
work_t *create_work(int nb_args, void **args, void (*task) (int, void **, int)){
work_t *work;
work = MALLOC(sizeof(work_t));
work -> nb_args = nb_args;
@ -293,6 +305,34 @@ void destroy_work(work_t *work){
FREE(work);
}
/* CODE example 2 functions and test driver*/
void f1 (int nb_args, void **args, int thread_id){
int a, b;
a = *(int*)args[0];
b = *(int*)args[1];
printf("id: %d, nb_args=%d, a=%d, b=%d\n",thread_id, nb_args,a,b);
}
void f2 (int nb_args, void **args, int thread_id){
int n, *tab;
int *res;
int i,j;
n = *(int*)args[0];
tab = (int*)args[1];
res=(int*)args[2];
for(j=0;j<1000000;j++){
*res=0;
for (i=0;i<n;i++)
*res+=tab[i];
}
printf("id: %d, done: %d!\n",thread_id, nb_args);
}
int test_main(void){

Просмотреть файл

@ -2,17 +2,18 @@
#define THREAD_POOL_H
#include <pthread.h>
#include "opal/mca/hwloc/hwloc-internal.h"
#include <hwloc.h>
typedef struct _work_t{
int nb_args;
void (*task)(int nb_args, void **args);
void (*task)(int nb_args, void **args, int thread_id);
void **args;
struct _work_t *next;
pthread_cond_t work_done;
pthread_mutex_t mutex;
int done;
int thread_id;
}work_t;
typedef struct {
@ -38,8 +39,10 @@ int get_nb_threads(void);
int submit_work(work_t *work, int thread_id);
void wait_work_completion(work_t *work);
void terminate_thread_pool(void);
work_t *create_work(int nb_args, void **args, void (int, void **));
work_t *create_work(int nb_args, void **args, void (int, void **, int));
int test_main(void);
#endif /* THREAD_POOL_H */

Просмотреть файл

@ -12,6 +12,7 @@ void get_time(void)
CLOCK(time_tab[clock_num]);
}
double time_diff(void)
{
CLOCK_T t2,t1;
@ -22,7 +23,7 @@ double time_diff(void)
}
if(clock_num < 0){
return -1.0;
return -2.0;
}
CLOCK(t2);

Просмотреть файл

@ -1,4 +1,3 @@
#ifndef TIMINGS_H
#define TIMINGS_H
#include <stdio.h>

Просмотреть файл

@ -0,0 +1,842 @@
#include <hwloc.h>
#include <hwloc/helper.h>
#include "tm_tree.h"
#include "tm_mapping.h"
#include <ctype.h>
#include "tm_verbose.h"
#include "tm_solution.h"
tm_topology_t* get_local_topo_with_hwloc(void);
tm_topology_t* hwloc_to_tm(char *filename);
int int_cmp_inc(const void* x1,const void* x2);
void optimize_arity(int **arity, double **cost, int *nb_levels,int n);
int symetric(hwloc_topology_t topology);
tm_topology_t * tgt_to_tm(char *filename);
void tm_display_arity(tm_topology_t *topology);
void tm_display_topology(tm_topology_t *topology);
void tm_free_topology(tm_topology_t *topology);
tm_topology_t *tm_load_topology(char *arch_filename, tm_file_type_t arch_file_type);
void tm_optimize_topology(tm_topology_t **topology);
int tm_topology_add_binding_constraints(char *constraints_filename, tm_topology_t *topology);
int topo_nb_proc(hwloc_topology_t topology,int N);
void topology_arity_cpy(tm_topology_t *topology,int **arity,int *nb_levels);
void topology_constraints_cpy(tm_topology_t *topology,int **constraints,int *nb_constraints);
void topology_cost_cpy(tm_topology_t *topology,double **cost);
void topology_numbering_cpy(tm_topology_t *topology,int **numbering,int *nb_nodes);
double ** topology_to_arch(hwloc_topology_t topology);
void build_synthetic_proc_id(tm_topology_t *topology);
tm_topology_t *tm_build_synthetic_topology(int *arity, double *cost, int nb_levels, int *core_numbering, int nb_core_per_nodes);
#define LINE_SIZE (1000000)
/* transform a tgt scotch file into a topology file*/
/* Parse a Scotch "tleaf" architecture file and build the corresponding
   synthetic treematch topology.  The first line of the file must look like:
       tleaf <nb_levels> <arity_0> <cost_0> ... <arity_n-1> <cost_n-1>
   Costs are aggregated bottom-up so that cost[l] is the total cost of
   communicating across level l.  Exits on any I/O or syntax error.
   Fixes vs previous version: the fgets() result was unchecked (an empty
   file left `line` indeterminate before strstr — UB) and strtok() results
   were fed to atoi() without a NULL check. */
tm_topology_t * tgt_to_tm(char *filename)
{
  tm_topology_t *topology = NULL;
  FILE *pf = NULL;
  char line[1024];
  char *s = NULL;
  char *tok = NULL;
  double *cost = NULL;
  int i;

  pf = fopen(filename,"r");
  if(!pf){
    if(tm_get_verbose_level() >= CRITICAL)
      fprintf(stderr,"Cannot open %s\n",filename);
    exit(-1);
  }

  if(tm_get_verbose_level() >= INFO)
    printf("Reading TGT file: %s\n",filename);

  /* the whole tleaf description sits on the first line of the file */
  if(!fgets(line,1024,pf)){
    if(tm_get_verbose_level() >= CRITICAL)
      fprintf(stderr,"Cannot read from %s\n",filename);
    fclose(pf);
    exit(-1);
  }
  fclose(pf);

  s = strstr(line,"tleaf");
  if(!s){
    if(tm_get_verbose_level() >= CRITICAL)
      fprintf(stderr,"Syntax error! %s is not a tleaf file\n",filename);
    exit(-1);
  }

  s += 5;
  while(isspace(*s))
    s++;

  topology = (tm_topology_t*)MALLOC(sizeof(tm_topology_t));
  topology->nb_constraints = 0;
  topology->oversub_fact = 1;
  topology->constraints = NULL;

  tok = strtok(s," ");
  if(!tok){
    if(tm_get_verbose_level() >= CRITICAL)
      fprintf(stderr,"Syntax error! %s: missing level count\n",filename);
    exit(-1);
  }
  topology->nb_levels = atoi(tok)+1;
  topology->arity = (int*)MALLOC(sizeof(int)*topology->nb_levels);
  cost = (double*)CALLOC(topology->nb_levels,sizeof(double));

  for( i = 0 ; i < topology->nb_levels-1 ; i++ ){
    char *arity_str = strtok(NULL," ");
    char *cost_str  = strtok(NULL," ");
    if(!arity_str || !cost_str){
      if(tm_get_verbose_level() >= CRITICAL)
        fprintf(stderr,"Syntax error! %s: truncated tleaf description\n",filename);
      exit(-1);
    }
    topology->arity[i] = atoi(arity_str);
    /* NOTE(review): costs are parsed with atoi, so fractional costs are
       truncated — confirm tleaf costs are always integral. */
    cost[i] = atoi(cost_str);
  }

  topology->arity[topology->nb_levels-1] = 0;
  /* cost[topology->nb_levels-1]=0; */

  /* aggregate costs: crossing level i also incurs the cost of all deeper levels */
  for( i = topology->nb_levels-2 ; i >= 0 ; i-- )
    cost[i] += cost[i+1];

  build_synthetic_proc_id(topology);

  if(tm_get_verbose_level() >= INFO)
    printf("Topology built from %s!\n",filename);

  topology->cost=cost;
  return topology;
}
/* Return the number of processing units reachable from the first PU of
   the hwloc topology: 1 (the PU itself) plus the objects hwloc reports
   as closest to it, bounded by N. */
int topo_nb_proc(hwloc_topology_t topology,int N)
{
  hwloc_obj_t *objs = NULL;
  int nb_proc;

  objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*N);
  objs[0] = hwloc_get_next_obj_by_type(topology,HWLOC_OBJ_PU,NULL);
  /* hwloc fills objs[1..N-1] and returns how many it stored */
  nb_proc = 1 + hwloc_get_closest_objs(topology,objs[0],objs+1,N-1);
  FREE(objs);
  return nb_proc;
}
/* Cost weight of a link at the given depth in the tree: the deeper the
   link, the cheaper the communication (halving at each level).
   Fix vs previous version: the 11-entry table was indexed with the raw
   depth, so a topology deeper than the table (or a negative depth) read
   out of bounds; the index is now clamped to the table. */
double link_cost(int depth)
{
  /*
    Values used in earlier experiments:
    Bertha values
    double tab[5]={21,9,4.5,2.5,0.001};
    double tab[5]={1,1,1,1,1};
    double tab[6]={100000,10000,1000,500,100,10};
  */
  static const double tab[11] = {1024,512,256,128,64,32,16,8,4,2,1};
  const int last = (int)(sizeof(tab)/sizeof(tab[0])) - 1;

  if(depth < 0)
    depth = 0;       /* defensive: never index before the table */
  else if(depth > last)
    depth = last;    /* very deep levels all get the cheapest cost */

  return tab[depth];
  /*
    return 10*log(depth+2);
    return (depth+1);
    return (long int)pow(100,depth);
  */
}
/* Build an nb_proc x nb_proc architecture matrix from a hwloc topology:
   arch[p][q] is the link cost between PUs p and q (rows/columns indexed
   by os_index), derived from the depth of their common ancestor. */
double ** topology_to_arch(hwloc_topology_t topology)
{
  int nb_proc,i,j;
  hwloc_obj_t obj_proc1,obj_proc2,obj_res;
  double **arch = NULL;

  nb_proc = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
  arch = (double**)MALLOC(sizeof(double*)*nb_proc);
  for( i = 0 ; i < nb_proc ; i++ ){
    obj_proc1 = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,i);
    /* rows are indexed by os_index, not by the hwloc logical index i */
    arch[obj_proc1->os_index] = (double*)MALLOC(sizeof(double)*nb_proc);
    for( j = 0 ; j < nb_proc ; j++ ){
      obj_proc2 = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,j);
      obj_res = hwloc_get_common_ancestor_obj(topology,obj_proc1,obj_proc2);
      /* printf("arch[%d][%d] <- %ld\n",obj_proc1->os_index,obj_proc2->os_index,*((long int*)(obj_res->userdatab))); */
      /* the deeper the common ancestor, the cheaper the link */
      arch[obj_proc1->os_index][obj_proc2->os_index]=link_cost(obj_res->depth+1);
    }
  }
  return arch;
}
/* Return 1 if, at every depth, all hwloc nodes have the same arity
   (i.e. the tree is balanced/symmetric), 0 otherwise. */
int symetric(hwloc_topology_t topology)
{
  int level, k;
  int max_depth = hwloc_topology_get_depth(topology);

  for (level = 0; level < max_depth - 1; level++) {
    int nb_objs = hwloc_get_nbobjs_by_depth(topology, level);
    hwloc_obj_t node = hwloc_get_next_obj_by_depth(topology, level, NULL);
    unsigned int ref_arity = node->arity;

    /* every remaining node of this level must match the first one */
    for (k = 1; k < nb_objs; k++) {
      node = hwloc_get_next_obj_by_depth(topology, level, node);
      if (node->arity != ref_arity)
        return 0;
    }
  }
  return 1;
}
/*
  Build a TreeMatch topology from an hwloc XML topology file.
  The hwloc tree must be symmetric (same arity for all nodes of a level),
  otherwise the program aborts.  The returned topology owns all its arrays
  and must be released with tm_free_topology().
*/
tm_topology_t* hwloc_to_tm(char *filename)
{
  hwloc_topology_t topology;
  tm_topology_t *res = NULL;
  hwloc_obj_t *objs = NULL;
  unsigned topodepth,depth;
  unsigned int nb_nodes;
  double *cost;
  int err, l;
  unsigned int i;
  int vl = tm_get_verbose_level();

  /* Build the hwloc topology from the XML file */
  hwloc_topology_init(&topology);
  err = hwloc_topology_set_xml(topology,filename);
  if(err == -1){
    if(vl >= CRITICAL)
      fprintf(stderr,"Error: %s is a bad xml topology file!\n",filename);
    exit(-1);
  }
  hwloc_topology_ignore_all_keep_structure(topology);
  hwloc_topology_load(topology);

  /* TreeMatch only handles symmetric trees */
  if(!symetric(topology)){
    if(tm_get_verbose_level() >= CRITICAL)
      fprintf(stderr,"%s not symetric!\n",filename);
    exit(-1);
  }

  topodepth = hwloc_topology_get_depth(topology);

  res = (tm_topology_t*)MALLOC(sizeof(tm_topology_t));
  res->oversub_fact = 1;      /* no oversubscribing by default */
  res->nb_constraints = 0;
  res->constraints = NULL;
  res->nb_levels = topodepth;
  res->node_id = (int**)MALLOC(sizeof(int*)*res->nb_levels);
  res->node_rank = (int**)MALLOC(sizeof(int*)*res->nb_levels);
  res->nb_nodes = (size_t*)MALLOC(sizeof(size_t)*res->nb_levels);
  res->arity = (int*)MALLOC(sizeof(int)*res->nb_levels);

  if(vl >= INFO)
    printf("topodepth = %d\n",topodepth);

  /* Build the TreeMatch levels, one per hwloc depth */
  for( depth = 0 ; depth < topodepth ; depth++ ){
    nb_nodes = hwloc_get_nbobjs_by_depth(topology, depth);
    res->nb_nodes[depth] = nb_nodes;
    res->node_id[depth] = (int*)MALLOC(sizeof(int)*nb_nodes);
    res->node_rank[depth] = (int*)MALLOC(sizeof(int)*nb_nodes);

    objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*nb_nodes);
    objs[0] = hwloc_get_next_obj_by_depth(topology,depth,NULL);
    hwloc_get_closest_objs(topology,objs[0],objs+1,nb_nodes-1);
    res->arity[depth] = objs[0]->arity;

    if (depth == topodepth -1){
      res->nb_constraints = nb_nodes;
      res->nb_proc_units = nb_nodes;
    }

    if(vl >= DEBUG)
      printf("\n--%d(%d) **%d**:--\n",res->arity[depth],nb_nodes,res->arity[0]);

    /* Build the id <-> rank tables.
       node_rank[depth] holds nb_nodes entries and is indexed by os_index,
       so any os_index >= nb_nodes would write out of bounds.
       Bug fix: the original test used '>' which let os_index == nb_nodes
       through; use '>=' instead. */
    for (i = 0; i < nb_nodes; i++){
      if(objs[i]->os_index >= nb_nodes){
        if(vl >= CRITICAL){
          fprintf(stderr, "Index of object %d of level %d is %d and larger than number of nodes : %d\n",
                  i, depth, objs[i]->os_index, nb_nodes);
        }
        exit(-1);
      }
      res->node_id[depth][i] = objs[i]->os_index;
      res->node_rank[depth][objs[i]->os_index] = i;
    }
    FREE(objs);
  }

  /* Per-level costs come from the static link_cost table */
  cost = (double*)CALLOC(res->nb_levels,sizeof(double));
  for(l=0; l<res->nb_levels; l++){
    cost[l] = link_cost(l);
  }
  res->cost = cost;

  /* The hwloc object is no longer needed */
  hwloc_topology_destroy(topology);
  if(tm_get_verbose_level() >= INFO)
    printf("\n");
  return res;
}
tm_topology_t* get_local_topo_with_hwloc(void)
{
hwloc_topology_t topology;
tm_topology_t *res = NULL;
hwloc_obj_t *objs = NULL;
unsigned topodepth,depth;
int nb_nodes,i;
/* Build the topology */
hwloc_topology_init(&topology);
hwloc_topology_ignore_all_keep_structure(topology);
hwloc_topology_load(topology);
/* Test if symetric */
if(!symetric(topology)){
if(tm_get_verbose_level() >= CRITICAL)
fprintf(stderr,"Local toplogy not symetric!\n");
exit(-1);
}
/* work on depth */
topodepth = hwloc_topology_get_depth(topology);
res = (tm_topology_t*)MALLOC(sizeof(tm_topology_t));
res->nb_constraints = 0;
res->constraints = NULL;
res->nb_levels = topodepth;
res->node_id = (int**)MALLOC(sizeof(int*)*res->nb_levels);
res->node_rank = (int**)MALLOC(sizeof(int*)*res->nb_levels);
res->nb_nodes = (size_t*)MALLOC(sizeof(size_t)*res->nb_levels);
res->arity = (int*)MALLOC(sizeof(int)*res->nb_levels);
/* Build TreeMatch topology */
for( depth = 0 ; depth < topodepth ; depth++ ){
nb_nodes = hwloc_get_nbobjs_by_depth(topology, depth);
res->nb_nodes[depth] = nb_nodes;
res->node_id[depth] = (int*)MALLOC(sizeof(int)*nb_nodes);
res->node_rank[depth] = (int*)MALLOC(sizeof(int)*nb_nodes);
objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*nb_nodes);
objs[0] = hwloc_get_next_obj_by_depth(topology,depth,NULL);
hwloc_get_closest_objs(topology,objs[0],objs+1,nb_nodes-1);
res->arity[depth] = objs[0]->arity;
if (depth == topodepth -1){
res->nb_constraints = nb_nodes;
res->nb_proc_units = nb_nodes;
}
/* printf("%d:",res->arity[depth]); */
/* Build process id tab */
for (i = 0; i < nb_nodes; i++){
res->node_id[depth][i] = objs[i]->os_index;
res->node_rank[depth][objs[i]->os_index] = i;
/* if(depth==topodepth-1) */
}
FREE(objs);
}
/* Destroy HWLOC topology object. */
hwloc_topology_destroy(topology);
/* printf("\n"); */
return res;
}
/* Release every array owned by a tm_topology_t, then the structure itself.
   FREE(NULL) is safe, so absent constraints need no special case. */
void tm_free_topology(tm_topology_t *topology)
{
  int level;

  for (level = 0; level < topology->nb_levels; level++) {
    FREE(topology->node_id[level]);
    FREE(topology->node_rank[level]);
  }

  FREE(topology->constraints);
  FREE(topology->node_id);
  FREE(topology->node_rank);
  FREE(topology->nb_nodes);
  FREE(topology->arity);
  FREE(topology->cost);
  FREE(topology);
}
/* Load a topology from a file, dispatching on the file type.
   Aborts on an unknown file type. */
tm_topology_t *tm_load_topology(char *arch_filename, tm_file_type_t arch_file_type){
  if (arch_file_type == TM_FILE_TYPE_TGT)
    return tgt_to_tm(arch_filename);
  if (arch_file_type == TM_FILE_TYPE_XML)
    return hwloc_to_tm(arch_filename);

  if (tm_get_verbose_level() >= ERROR){
    fprintf(stderr,"Error loading topology. Filetype %d unknown\n", arch_file_type);
  }
  exit(-1);
}
/* Print the node ids of every level, the ranks of the last level, the
   constraints (if any) and a summary of the topology scalar fields. */
void tm_display_topology(tm_topology_t *topology)
{
  int depth;
  unsigned int k;
  unsigned long r, last_size;

  for (depth = 0; depth < topology->nb_levels; depth++) {
    printf("%d: ", depth);
    for (k = 0; k < topology->nb_nodes[depth]; k++)
      printf("%d ", topology->node_id[depth][k]);
    printf("\n");
  }

  /* with oversubscribing the last level repeats each unit oversub_fact
     times; only print each unit once */
  last_size = topology->nb_nodes[topology->nb_levels-1] / topology->oversub_fact;
  printf("Last level: ");
  for (r = 0; r < last_size; r++)
    printf("%d ", topology->node_rank[topology->nb_levels-1][r]);
  printf("\n");

  if (topology->constraints) {
    printf("Constraints: ");
    for (depth = 0; depth < topology->nb_constraints; depth++)
      printf("%d ", topology->constraints[depth]);
    printf("\n");
  }

  printf("\tnb_levels=%d\n\tnb_constraints=%d\n\toversub_fact=%d\n\tnb proc units=%d\n\n",
         topology->nb_levels, topology->nb_constraints, topology->oversub_fact, topology->nb_proc_units);
}
/* Print "arity(cost): " for every level of the topology on one line. */
void tm_display_arity(tm_topology_t *topology){
  int lvl = 0;
  while (lvl < topology->nb_levels) {
    printf("%d(%lf): ", topology->arity[lvl], topology->cost[lvl]);
    lvl++;
  }
  printf("\n");
}
/*
  qsort() comparator for ints in increasing order.
  Bug fix: the original returned 1 for equal keys, so cmp(a,b) and
  cmp(b,a) could both be positive — qsort requires an antisymmetric
  comparator (cmp(a,b) == -cmp(b,a)), and violating it is undefined
  behavior.  Return <0, 0 or >0 instead.
*/
int int_cmp_inc(const void* x1,const void* x2)
{
  int a = *(const int *)x1;
  int b = *(const int *)x2;
  return (a > b) - (a < b);
}
/*
  Check that every constraint id is a valid node id of the last level of
  the topology.
  Returns 1 when all constraints are valid, 0 otherwise (after logging the
  offending id when verbosity allows).
  Membership is tested with in_tab(), defined elsewhere in the library.
*/
int topo_check_constraints(tm_topology_t *topology){
  int n = topology->nb_constraints;
  int i;
  int depth = topology->nb_levels-1; /* constraints apply to the leaf level */
  for (i=0;i<n;i++){
    if(!in_tab(topology->node_id[depth], topology->nb_nodes[depth], topology->constraints[i])){
      if(tm_get_verbose_level() >= CRITICAL){
        fprintf(stderr,"Error! Incompatible constraint with the topology: rank %d in the constraints is not a valid id of any nodes of the topology.\n",topology->constraints[i]);
      }
      return 0;
    }
  }
  return 1;
}
/*
  Attach binding constraints to a topology.
  cpy_flag == 1: the array is duplicated (call from the application level).
  cpy_flag == 0: the topology takes ownership of the given array
                 (call from inside the library).
  Returns 1 if the constraints are compatible with the topology, 0 otherwise.
*/
int tm_topology_set_binding_constraints_cpy(int *constraints, int nb_constraints, tm_topology_t *topology, int cpy_flag){
  topology->nb_constraints = nb_constraints;
  if (!cpy_flag) {
    topology->constraints = constraints;
  } else {
    size_t nbytes = nb_constraints * sizeof(int);
    topology->constraints = (int*)MALLOC(nbytes);
    memcpy(topology->constraints, constraints, nbytes);
  }
  return topo_check_constraints(topology);
}
/* Public entry point: always duplicates the constraints array, so the
   caller keeps ownership of its buffer. */
int tm_topology_set_binding_constraints(int *constraints, int nb_constraints, tm_topology_t *topology){
  const int copy_array = 1;
  return tm_topology_set_binding_constraints_cpy(constraints, nb_constraints, topology, copy_array);
}
/*
  Read binding constraints from a file and attach them to the topology.
  The file is expected to hold a single line of whitespace-separated
  integer node ids; they are sorted increasingly before being stored.
  Returns the result of the constraint/topology compatibility check.
  Aborts the program on I/O or format errors.
*/
int tm_topology_add_binding_constraints(char *constraints_filename, tm_topology_t *topology)
{
  int *tab = NULL;
  FILE *pf = NULL;
  char line[LINE_SIZE],*l = NULL;
  char *ptr = NULL;
  int i,n;
  unsigned int vl = tm_get_verbose_level();

  if (!(pf = fopen(constraints_filename,"r"))) {
    if(vl >= CRITICAL)
      fprintf(stderr,"Cannot open %s\n",constraints_filename);
    exit(-1);
  }

  /* First pass: count the tokens so the array can be sized.
     Bug fix: the return value of fgets was ignored, so an empty file
     led to tokenizing an uninitialized buffer. */
  if (!fgets(line, LINE_SIZE, pf)) {
    if(vl >= CRITICAL)
      fprintf(stderr,"Cannot read %s\n",constraints_filename);
    exit(-1);
  }
  n = 0;
  l = line;
  while((ptr = strtok(l," \t"))){
    l = NULL; /* subsequent strtok calls continue on the same line */
    if((ptr[0] != '\n') && (!isspace(ptr[0])) && (*ptr))
      n++;
  }

  tab = (int*)MALLOC(n*sizeof(int));

  /* Second pass: strtok destroyed the buffer, so re-read the line. */
  rewind(pf);
  if (!fgets(line, LINE_SIZE, pf)) {
    if(vl >= CRITICAL)
      fprintf(stderr,"Cannot read %s\n",constraints_filename);
    exit(-1);
  }
  fclose(pf);

  l = line;
  i = 0;
  while((ptr = strtok(l," \t"))){
    l = NULL;
    if((ptr[0] != '\n') && (!isspace(ptr[0])) && (*ptr)){
      if(i < n)
        tab[i] = atoi(ptr);
      else{
        if(vl >= CRITICAL)
          fprintf(stderr, "More than %d entries in %s\n", n, constraints_filename);
        exit(-1);
      }
      i++;
    }
  }

  if( i != n ){
    if(vl >= CRITICAL)
      fprintf(stderr, "Read %d entries while expecting %d ones\n", i, n);
    exit(-1);
  }

  /* Sort the ids; the topology takes ownership of tab (cpy_flag = 0). */
  qsort(tab,n,sizeof(int),int_cmp_inc);
  return tm_topology_set_binding_constraints_cpy(tab, n, topology, 0);
}
/* Copy the node ids of the last level (the numbering of the computing
   units) into a freshly allocated array *numbering of size *nb_nodes. */
void topology_numbering_cpy(tm_topology_t *topology,int **numbering,int *nb_nodes)
{
  int last = topology->nb_levels - 1;
  size_t nbytes;
  unsigned int vl = tm_get_verbose_level();

  *nb_nodes = topology->nb_nodes[last];
  if (vl >= INFO)
    printf("nb_nodes=%d\n", *nb_nodes);

  nbytes = sizeof(int) * (*nb_nodes);
  *numbering = (int*)MALLOC(nbytes);
  memcpy(*numbering, topology->node_id[last], nbytes);
}
/* Duplicate the arity array of the topology; also returns its length. */
void topology_arity_cpy(tm_topology_t *topology,int **arity,int *nb_levels)
{
  size_t nbytes = sizeof(int) * topology->nb_levels;
  *nb_levels = topology->nb_levels;
  *arity = (int*)MALLOC(nbytes);
  memcpy(*arity, topology->arity, nbytes);
}
/* Duplicate the constraints array of the topology.
   *constraints is set to NULL when the topology has none. */
void topology_constraints_cpy(tm_topology_t *topology,int **constraints,int *nb_constraints)
{
  *nb_constraints = topology->nb_constraints;
  *constraints = NULL;
  if (topology->constraints) {
    size_t nbytes = sizeof(int) * (*nb_constraints);
    *constraints = (int*)MALLOC(nbytes);
    memcpy(*constraints, topology->constraints, nbytes);
  }
}
/* Duplicate the per-level cost array of the topology. */
void topology_cost_cpy(tm_topology_t *topology,double **cost)
{
  size_t nbytes = sizeof(double) * topology->nb_levels;
  *cost = (double*)MALLOC(nbytes);
  memcpy(*cost, topology->cost, nbytes);
}
/*
  Recursively decompose the arity of level n of a topology description.
  When the arity a of level n is a strict multiple of 3 (resp. 2), the
  level is split into a level of arity 3 (resp. 2) followed by a level of
  arity a/3 (resp. a/2), both inheriting the original level's cost; the
  arrays are reallocated one entry longer and *nb_levels is incremented.
  Processing walks from level n up to level 0 (n < 0 terminates).
  *arity and *cost are replaced in place; the old arrays are freed.
  NOTE(review): the two branches below are identical except for the
  constant 3 vs 2 — a candidate for factoring into one helper.
*/
void optimize_arity(int **arity, double **cost, int *nb_levels,int n)
{
  int a,i;
  int *new_arity = NULL;
  double *new_cost = NULL;

  if( n < 0 ) /* walked past the root: nothing left to decompose */
    return;
  a = (*arity)[n];
  if( (a%3 == 0) && (a > 3) ){
    /* arity divisible by 3 (and > 3): insert a new level */
    (*nb_levels)++;
    /* Build the enlarged arity and cost arrays */
    new_arity = (int*)MALLOC(sizeof(int)*(*nb_levels));
    new_cost = (double*)MALLOC(sizeof(double)*(*nb_levels));
    /* Copy the beginning of the old arrays (levels above n) */
    for( i = 0 ; i < n ; i++){
      new_arity[i] = (*arity)[i];
      new_cost[i] = (*cost)[i];
    }
    /* level n gets arity 3 ... */
    new_arity[n] = 3;
    /* ... and keeps the original level's cost */
    new_cost[n] = (*cost)[n];;
    /* level n+1 gets the remaining factor a/3 */
    new_arity[n+1] = a/3;
    /* duplicate the cost: both halves come from the same original level */
    new_cost[n+1] = (*cost)[n];
    /* Copy the tail; *nb_levels was already incremented, hence the i-1
       when reading the old arrays */
    for( i = n+2 ; i < *nb_levels ; i++){
      new_arity[i] = (*arity)[i-1];
      new_cost[i] = (*cost)[i-1];
    }
    FREE(*arity);
    FREE(*cost);
    /* if a/3 == 3, level n+1 needs no further split: resume at level n;
       otherwise keep splitting level n+1 (we just inserted a level) */
    if(new_arity[n+1] == 3)
      optimize_arity(&new_arity,&new_cost,nb_levels,n);
    else
      optimize_arity(&new_arity,&new_cost,nb_levels,n+1);
    *arity=new_arity;
    *cost=new_cost;
  }else if( (a%2==0) && (a>2) ){ /* same scheme for a factor of 2 */
    (*nb_levels)++;
    new_arity = (int*)MALLOC(sizeof(int)*(*nb_levels));
    new_cost = (double*)MALLOC(sizeof(double)*(*nb_levels));
    for( i = 0 ; i < n ; i++ ){
      new_arity[i] = (*arity)[i];
      new_cost[i] = (*cost)[i];
    }
    new_arity[n] = 2;
    new_cost[n] = (*cost)[n];;
    new_arity[n+1] = a/2;
    new_cost[n+1] = (*cost)[n];
    for( i = n+2 ; i < *nb_levels ; i++ ){
      new_arity[i] = (*arity)[i-1];
      new_cost[i] = (*cost)[i-1];
    }
    FREE(*arity);
    FREE(*cost);
    if(new_arity[n+1] == 2)
      optimize_arity(&new_arity, &new_cost, nb_levels, n);
    else
      optimize_arity(&new_arity, &new_cost, nb_levels, n+1);
    *arity = new_arity;
    *cost= new_cost;
  }else /* this level cannot be split: move one level up */
    optimize_arity(arity, cost, nb_levels,n-1);
}
/*
  Optimize a topology by recursively splitting levels whose arity is a
  multiple of 2 or 3 (see optimize_arity), then rebuilding a synthetic
  topology with the decomposed arities.
  The old topology is freed and *topology is replaced by the new one;
  constraints, costs, oversubscribing factor and processing-unit count
  are carried over.
*/
void tm_optimize_topology(tm_topology_t **topology){
  int *arity = NULL,nb_levels;
  int *numbering = NULL,nb_nodes;
  tm_topology_t *new_topo;
  double *cost;
  unsigned int vl = tm_get_verbose_level();
  int *constraints = NULL, nb_constraints;
  int i;

  if(vl >= DEBUG)
    tm_display_arity(*topology);

  /* Take private copies of the relevant arrays: they must survive the
     destruction of the old topology below. */
  topology_arity_cpy(*topology,&arity,&nb_levels);
  topology_numbering_cpy(*topology,&numbering,&nb_nodes);
  topology_constraints_cpy(*topology,&constraints,&nb_constraints);
  topology_cost_cpy(*topology,&cost);

  /* Decompose arities, starting from the last non-leaf level. */
  optimize_arity(&arity,&cost,&nb_levels,nb_levels-2);
  new_topo = tm_build_synthetic_topology(arity, NULL, nb_levels,numbering,nb_nodes);
  new_topo->cost = cost;               /* new_topo takes ownership of cost */
  new_topo->constraints = constraints; /* ... and of constraints */
  new_topo->nb_constraints = nb_constraints;
  new_topo->nb_proc_units = (*topology)->nb_proc_units;
  new_topo->oversub_fact = (*topology)->oversub_fact;

  if(vl >= DEBUG){
    if(constraints){
      printf("Constraints: ");
      for(i=0;i<nb_constraints;i++)
        printf("%d - ",constraints[i]);
      printf("\n");
    }
    tm_display_arity(new_topo);
  }
  FREE(arity);
  FREE(numbering);
  tm_free_topology(*topology);
  *topology = new_topo;
}
/*
  Build a synthetic balanced topology.
  arity            : arity of each of the nb_levels levels
  cost             : cost between consecutive levels (size nb_levels);
                     may be NULL, in which case no cost array is built
  core_numbering   : numbering of the cores by the system
                     (array of size nb_core_per_nodes)
  nb_core_per_nodes: number of cores of a given node (size of
                     core_numbering)
  Both arity and cost are copied.  Cores are numbered in round-robin
  fashion after a width-first traversal of the topology, e.g.:
    {0,1,2,3} becomes 0,1,2,3,4,5,6,7,...
    {0,2,1,3} becomes 0,2,1,3,4,6,5,7,...
*/
tm_topology_t *tm_build_synthetic_topology(int *arity, double *cost, int nb_levels, int *core_numbering, int nb_core_per_nodes)
{
  tm_topology_t *topology = NULL;
  int i,j,n;

  topology = (tm_topology_t*)MALLOC(sizeof(tm_topology_t));
  topology->nb_constraints = 0;
  topology->oversub_fact = 1;
  topology->constraints = NULL;
  topology->nb_levels = nb_levels;
  topology->arity = (int*)MALLOC(sizeof(int)*topology->nb_levels);
  topology->node_id = (int**)MALLOC(sizeof(int*)*topology->nb_levels);
  topology->node_rank = (int**)MALLOC(sizeof(int*)*topology->nb_levels);
  topology->nb_nodes = (size_t *)MALLOC(sizeof(size_t)*topology->nb_levels);
  if(cost)
    topology->cost = (double*)CALLOC(topology->nb_levels,sizeof(double));
  else
    topology->cost = NULL;

  memcpy(topology->arity, arity, sizeof(int)*nb_levels);
  if(cost)
    memcpy(topology->cost, cost, sizeof(double)*nb_levels);

  /* Level i holds n = prod(arity[0..i-1]) nodes */
  n = 1;
  for( i = 0 ; i < topology->nb_levels ; i++ ){
    topology->nb_nodes[i] = n;
    topology->node_id[i] = (int*)MALLOC(sizeof(int)*n);
    topology->node_rank[i] = (int*)MALLOC(sizeof(int)*n);
    if( i < topology->nb_levels-1){
      /* inner levels: identity numbering */
      for( j = 0 ; j < n ; j++ ){
        topology->node_id[i][j] = j;
        topology->node_rank[i][j]=j;
      }
    }else{
      /* last level: apply the per-node core numbering, shifted by the
         node offset (round-robin over nb_core_per_nodes) */
      for( j = 0 ; j < n ; j++ ){
        int id = core_numbering[j%nb_core_per_nodes] + (nb_core_per_nodes)*(j/nb_core_per_nodes);
        topology->node_id[i][j] = id;
        topology->node_rank[i][id] = j;
      }
    }
    if (i == topology->nb_levels-1){
      topology->nb_constraints = n;
      topology->nb_proc_units = n;
    }
    n *= topology->arity[i];
  }
  if(cost){
    /* aggregate costs: each level's cost includes all levels below it */
    for( i = topology->nb_levels-2 ; i >= 0 ; i-- )
      topology->cost[i] += topology->cost[i+1];
  }
  return topology;
}
/*
  Allocate and fill node_id / node_rank / nb_nodes for a synthetic
  (balanced) topology whose arity array is already set: level i holds
  prod(arity[0..i-1]) nodes numbered 0..n-1, so id and rank coincide.
  Also sets nb_constraints and nb_proc_units from the last level.
  Aborts on allocation failure.
*/
void build_synthetic_proc_id(tm_topology_t *topology)
{
  int i;
  size_t j,n = 1;

  topology->node_id = (int**)MALLOC(sizeof(int*)*topology->nb_levels);
  topology->node_rank = (int**)MALLOC(sizeof(int*)*topology->nb_levels);
  topology->nb_nodes = (size_t*) MALLOC(sizeof(size_t)*topology->nb_levels);

  for( i = 0 ; i < topology->nb_levels ; i++ ){
    topology->nb_nodes[i] = n;
    /* bug fix: these arrays hold int but were allocated with
       sizeof(long int), over-allocating on LP64 platforms */
    topology->node_id[i] = (int*)MALLOC(sizeof(int)*n);
    topology->node_rank[i] = (int*)MALLOC(sizeof(int)*n);
    /* check both allocations, not just node_id */
    if ( !topology->node_id[i] || !topology->node_rank[i] ){
      if(tm_get_verbose_level() >= CRITICAL)
        fprintf(stderr,"Cannot allocate level %d (of size %ld) of the topology\n", i, (unsigned long int)n);
      exit(-1);
    }
    if (i == topology->nb_levels-1){
      topology->nb_constraints = n;
      topology->nb_proc_units = n;
    }
    /* identity numbering: id and rank are equal on every level */
    for( j = 0 ; j < n ; j++ ){
      topology->node_id[i][j] = j;
      topology->node_rank[i][j] = j;
    }
    n *= topology->arity[i];
  }
}
/* Add one level at the bottom of the topology so that each former leaf
   hosts oversub_fact slots: the new level has arity oversub_fact, cost 0,
   and its slots reuse the id of the unit they belong to.
   A factor <= 1 is a no-op. */
void tm_enable_oversubscribing(tm_topology_t *topology, unsigned int oversub_fact){
  int last, slot, nb_slots;

  if (oversub_fact <= 1)
    return;

  topology->nb_levels++;
  topology->arity     = (int*)    REALLOC(topology->arity,     sizeof(int)    * topology->nb_levels);
  topology->cost      = (double*) REALLOC(topology->cost,      sizeof(double) * topology->nb_levels);
  topology->node_id   = (int**)   REALLOC(topology->node_id,   sizeof(int*)   * topology->nb_levels);
  topology->node_rank = (int**)   REALLOC(topology->node_rank, sizeof(int*)   * topology->nb_levels);
  topology->nb_nodes  = (size_t*) REALLOC(topology->nb_nodes,  sizeof(size_t) * topology->nb_levels);
  topology->oversub_fact = oversub_fact;

  last = topology->nb_levels - 1;
  nb_slots = topology->nb_nodes[last-1] * oversub_fact;
  topology->arity[last-1] = oversub_fact;
  topology->cost[last-1] = 0;      /* intra-unit slots communicate at cost 0 */
  topology->node_id[last]   = (int*)MALLOC(sizeof(int)*nb_slots);
  topology->node_rank[last] = (int*)MALLOC(sizeof(int)*nb_slots);
  topology->nb_nodes[last] = nb_slots;

  for (slot = 0; slot < nb_slots; slot++) {
    int unit_id = topology->node_id[last-1][slot/oversub_fact];
    topology->node_id[last][slot] = unit_id;
    topology->node_rank[last][unit_id] = slot;
  }
}

Просмотреть файл

@ -0,0 +1,22 @@
#include <hwloc.h>
#include "tm_tree.h"
tm_topology_t* get_local_topo_with_hwloc(void);
tm_topology_t* hwloc_to_tm(char *filename);
int int_cmp_inc(const void* x1,const void* x2);
void optimize_arity(int **arity, double **cost, int *nb_levels,int n);
int symetric(hwloc_topology_t topology);
tm_topology_t * tgt_to_tm(char *filename);
void tm_display_arity(tm_topology_t *topology);
void tm_display_topology(tm_topology_t *topology);
void tm_free_topology(tm_topology_t *topology);
tm_topology_t *tm_load_topology(char *arch_filename, tm_file_type_t arch_file_type);
void tm_optimize_topology(tm_topology_t **topology);
int tm_topology_add_binding_constraints(char *constraints_filename, tm_topology_t *topology);
int topo_nb_proc(hwloc_topology_t topology,int N);
void topology_arity(tm_topology_t *topology,int **arity,int *nb_levels);
void topology_constraints(tm_topology_t *topology,int **constraints,int *nb_constraints);
void topology_cost(tm_topology_t *topology,double **cost);
void topology_numbering(tm_topology_t *topology,int **numbering,int *nb_nodes);
double ** topology_to_arch(hwloc_topology_t topology);

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,69 +1,22 @@
#ifndef __TREE_H__
#define __TREE_H__
#ifndef __TM_TREE_H__
#define __TM_TREE_H__
#include <stdlib.h>
#include "treematch.h"
typedef struct _node_info_t{
int submit_date;
int job_id;
int finish_date;
} job_info_t;
typedef struct _tree_t{
int constraint; /* tells if the tree has been constructed with constraints on the nodes or not. usefull for freeing it. needs to be set on the root only*/
struct _tree_t **child;
struct _tree_t *parent;
struct _tree_t *tab_child; /*the pointer to be freed*/
double val;
int arity;
int depth;
int id;
int uniq;
int dumb; /* 1 if the node belongs to a dumb tree: hence has to be freed separately*/
job_info_t *job_info;
}tree_t;
/* Maximum number of levels in the tree*/
#define MAX_LEVELS 100
typedef struct {
int *arity; /* arity of the nodes of each level*/
int nb_levels; /*number of levels of the tree. Levels are numbered from top to bottom starting at 0*/
int *nb_nodes; /*nb of nodes of each level*/
int *nb_free_nodes; /*nb of available nodes of each level*/
int **node_id; /*ID of the nodes of the tree for each level*/
int **free_nodes; /*ID of the nodes of the tree for each level*/
}tm_topology_t;
typedef struct {
double ** mat;
double * sum_row;
int order;
} affinity_mat_t;
tree_t * build_tree(double **tab,int N);
tree_t * build_tree_from_topology(tm_topology_t *topology,double **tab,int N, double *obj_weight, double *comm_speed);
void map_tree(tree_t *,tree_t*);
void update_val(tm_affinity_mat_t *aff_mat,tm_tree_t *parent);
void display_tab(double **tab,int N);
double speed(int depth);
void set_node(tree_t *node,tree_t ** child, int arity,tree_t *parent,int id,double val,tree_t *deb_tab_child, int depth);
void free_constraint_tree(tree_t *tree);
void free_tree(tree_t *tree);
void free_tab_double(double**tab,int N);
void free_tab_int(int**tab,int N);
void update_val(affinity_mat_t *aff_mat,tree_t *parent);
void FREE_tree(tree_t *tree);
void FREE_tab_double(double**,int);
void set_node(tm_tree_t *node,tm_tree_t ** child, int arity,tm_tree_t *parent,
int id,double val,tm_tree_t *tab_child,int depth);
typedef struct _group_list_t{
struct _group_list_t *next;
tree_t **tab;
tm_tree_t **tab;
double val;
double sum_neighbour;
double wg;
int id;
double *bound;
}group_list_t;
@ -74,21 +27,13 @@ typedef struct{
}adjacency_t;
/* for debugging malloc */
/* #define __DEBUG_MY_MALLOC__ */
#undef __DEBUG_MY_MALLOC__
#ifdef __DEBUG_MY_MALLOC__
#include "tm_malloc.h"
#define MALLOC(x) my_malloc(x,__FILE__,__LINE__)
#define CALLOC(x,y) my_calloc(x,y,__FILE__,__LINE__)
#define FREE my_free
#define MEM_CHECK my_mem_check
#else
#define MALLOC malloc
#define CALLOC calloc
#define FREE free
#define MEM_CHECK my_mem_check
#endif
typedef struct _work_unit_t{
int nb_groups;
int *tab_group;
int done;
int nb_work;
struct _work_unit_t *next;
}work_unit_t;
#endif

Просмотреть файл

@ -1,11 +1,34 @@
#include "tm_verbose.h"
#include <stdio.h>
static unsigned int verbose_level = ERROR;
static FILE *output = NULL;
void set_verbose_level(unsigned int level){
void tm_set_verbose_level(unsigned int level){
verbose_level = level;
}
unsigned int get_verbose_level(){
unsigned int tm_get_verbose_level(){
return verbose_level;
}
int tm_open_verbose_file(char *filename){
output = fopen(filename,"w");
if(output == NULL)
return 0;
else
return 1;
}
int tm_close_verbose_file(void){
if(output != NULL)
return fclose(output);
return 0;
}
FILE *tm_get_verbose_output(){
if(!output)
return stdout;
else
return output;
}

Просмотреть файл

@ -1,11 +1,22 @@
#include <stdio.h>
#define NONE 0
/* output in stderr*/
#define CRITICAL 1
#define ERROR 2
/* output in stdout*/
#define WARNING 3
#define INFO 4
#define DEBUG 5
void set_verbose_level(unsigned int level);
unsigned int get_verbose_level(void);
#define TIMING 4
#define INFO 5
#define DEBUG 6
/* return 0 on errror and 1 on success */
int tm_open_verbose_file(char *filename);
int tm_close_verbose_file(void);
void tm_set_verbose_level(unsigned int level);
unsigned int tm_get_verbose_level(void);
FILE * tm_get_verbose_output(void);
#define tm_verbose_printf(level, ...) level <= tm_get_verbose_level()?fprintf(tm_get_verbose_output(),__VA_ARGS__):0

188
ompi/mca/topo/treematch/treematch/treematch.h Обычный файл
Просмотреть файл

@ -0,0 +1,188 @@
#ifndef __TREEMATCH_H__
#define __TREEMATCH_H__
/* size_t definition */
#include <stddef.h>
#include "tm_verbose.h"
/********* TreeMatch Public Enum **********/
/*type of topology files that can be read*/
typedef enum{
TM_FILE_TYPE_UNDEF,
TM_FILE_TYPE_XML,
TM_FILE_TYPE_TGT
} tm_file_type_t;
/* different metrics to evaluate the solution */
typedef enum{
TM_METRIC_SUM_COM = 1,
TM_METRIC_MAX_COM = 2,
TM_METRIC_HOP_BYTE = 3
} tm_metric_t;
/********* TreeMatch Public Structures **********/
typedef struct _job_info_t{
int submit_date;
int job_id;
int finish_date;
} tm_job_info_t;
typedef struct _tree_t{
int constraint; /* tells if the tree has been constructed with constraints on the nodes or not.
Usefull for freeing it. needs to be set on the root only*/
struct _tree_t **child;
struct _tree_t *parent;
struct _tree_t *tab_child; /*the pointer to be freed*/
double val;
int arity;
int depth;
int id;
int uniq;
int dumb; /* 1 if the node belongs to a dumb tree: hence has to be freed separately*/
tm_job_info_t *job_info;
int nb_processes; /* number of grouped processes (i.e. the order of the affinity matrix). Set at the root only*/
}tm_tree_t; /* FT : changer le nom : tm_grouap_hierachy_t ?*/
/* Maximum number of levels in the tree*/
#define TM_MAX_LEVELS 100
typedef struct {
int *arity; /* arity of the nodes of each level*/
int nb_levels; /*number of levels of the tree. Levels are numbered from top to bottom starting at 0*/
size_t *nb_nodes; /*nb of nodes of each level*/
int **node_id; /*ID of the nodes of the tree for each level*/
int **node_rank ; /*rank of the nodes of the tree for each level given its ID: this is the inverse tab of node_id*/
size_t *nb_free_nodes; /*nb of available nodes of each level*/
int **free_nodes; /*tab of node that are free: useful to simulate batch scheduler*/
double *cost; /*cost of the communication depending on the distance:
cost[i] is the cost for communicating at distance nb_levels-i*/
int *constraints; /* array of constraints: id of the nodes where it is possible to map processes */
int nb_constraints; /* Size of the above array */
int oversub_fact; /* maximum number of processes to be mapped on a given node */
int nb_proc_units; /* the real number of units used for computation */
}tm_topology_t;
typedef struct {
double ** mat;
double * sum_row;
int order;
} tm_affinity_mat_t;
/*
sigma_i is such that process i is mapped on core sigma_i
k_i is such that core i exectutes process k_i_j (0<=j<<=oversubscribing factor - 1)
size of sigma is the number of processes (nb_objs)
size of k is the number of cores/nodes (nb_compute_units)
size of k[i] is the number of process we can execute per nodes (1 if no oversubscribing)
We must have numbe of process<=number of cores
k[i] == NULL if no process is mapped on core i
*/
typedef struct {
int *sigma;
size_t sigma_length;
int **k;
size_t k_length;
int oversub_fact;
}tm_solution_t;
/************ TreeMatch Public API ************/
/* load XML or TGT topology */
tm_topology_t *tm_load_topology(char *arch_filename, tm_file_type_t arch_file_type);
/*
Alternatively, build a synthetic balanced topology.
nb_levels : number of levels of the topology +1 (the last level must be of cost 0 and arity 0).
arity : array of arity of the first nb_level (of size nb_levels)
cost : array of costs between the levels (of size nb_levels)
core_numbering: numbering of the core by the system. Array of size nb_core_per_node
nb_core_per_nodes: number of cores of a given node. Size of the array core_numbering
both arity and cost are copied inside tm_build_synthetic_topology
The numbering of the cores is done in round robin fashion after a width traversal of the topology.
for example:
{0,1,2,3} becomes 0,1,2,3,4,5,6,7...
and
{0,2,1,3} becomes 0,2,1,3,4,6,5,7,...
Example of call to build the 128.tgt file: tleaf 4 16 500 2 100 2 50 2 10
double cost[5] = {500,100,50,10,0};
int arity[5] = {16,2,2,2,0};
int cn[5]={0,1};
topology = tm_build_synthetic_topology(arity,cost,5,cn,2);
*/
tm_topology_t *tm_build_synthetic_topology(int *arity, double *cost, int nb_levels, int *core_numbering, int nb_core_per_nodes);
/* load affinity matrix */
tm_affinity_mat_t *tm_load_aff_mat(char *com_filename);
/*
Alternativelly, build the affinity matrix from a array of array of matrix of size order by order
For performance reason mat is not copied.
*/
tm_affinity_mat_t * tm_build_affinity_mat(double **mat, int order);
/* Add constraints to toplogy
Return 1 on success and 0 if the constari,ts id are not compatible withe nodes id */
int tm_topology_add_binding_constraints(char *bind_filename, tm_topology_t *topology);
/* Alternatively, set the constraints from an array.
Return 1 on success and 0 if the constari,ts id are not compatible withe nodes id
The array constraints is copied inside tm_topology_set_binding_constraints
*/
int tm_topology_set_binding_constraints(int *constraints, int nb_constraints, tm_topology_t *topology);
/* display arity of the topology */
void tm_display_arity(tm_topology_t *topology);
/* display the full topology */
void tm_display_topology(tm_topology_t *topology);
/* Optimize the topology by decomposing arities */
void tm_optimize_topology(tm_topology_t **topology);
/* Manage oversubscribing */
void tm_enable_oversubscribing(tm_topology_t *topology, unsigned int oversub_fact);
/* core of the treematch: compute the solution tree */
tm_tree_t *tm_build_tree_from_topology(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, double *obj_weight, double *com_speed);
/* compute the mapping according to teh tree an dthe core numbering*/
tm_solution_t *tm_compute_mapping(tm_topology_t *topology, tm_tree_t *comm_tree);
/* display the solution*/
double tm_display_solution(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_solution_t *sol, tm_metric_t metric);
/* display RR, packed, MPIPP*/
void tm_display_other_heuristics(tm_topology_t *topology, tm_affinity_mat_t *aff_mat, tm_metric_t metric);
/* free TM strutures*/
void tm_free_topology(tm_topology_t *topology);
void tm_free_tree(tm_tree_t *comm_tree);
void tm_free_solution(tm_solution_t *sol);
void tm_free_affinity_mat(tm_affinity_mat_t *aff_mat);
/* manage verbosity of TM*/
void tm_set_verbose_level(unsigned int level);
unsigned int tm_get_verbose_level(void);
/* finalize treematch :check memory if necessary, and free internal variables (thread pool)*/
void tm_finalize();
/*
Ask for exhaustive search: may be very long
new_val == 0 : no exhuative search
new_val != 0 : exhuative search
*/
void tm_set_exhaustive_search_flag(int new_val);
int tm_get_exhaustive_search_flag();
/* Setting the maximum number of threads you want to use in parallel parts of TreeMatch */
void tm_set_max_nb_threads(unsigned int val);
#include "tm_malloc.h"
#endif

Просмотреть файл

@ -22,7 +22,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UTHASH_H
#define UTHASH_H
#define UTHASH_H
#include <string.h> /* memcmp,strlen */
#include <stddef.h> /* ptrdiff_t */
@ -49,7 +49,7 @@ do {
char **_da_dst = (char**)(&(dst)); \
*_da_dst = (char*)(src); \
} while(0)
#else
#else
#define DECLTYPE_ASSIGN(dst,src) \
do { \
(dst) = DECLTYPE(dst)(src); \
@ -121,9 +121,9 @@ do {
HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
#else
#define HASH_BLOOM_MAKE(tbl)
#define HASH_BLOOM_FREE(tbl)
#define HASH_BLOOM_ADD(tbl,hashv)
#define HASH_BLOOM_MAKE(tbl)
#define HASH_BLOOM_FREE(tbl)
#define HASH_BLOOM_ADD(tbl,hashv)
#define HASH_BLOOM_TEST(tbl,hashv) (1)
#endif
@ -148,7 +148,7 @@ do {
#define HASH_ADD(hh,head,fieldname,keylen_in,add) \
HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add)
#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \
do { \
unsigned _ha_bkt; \
@ -300,10 +300,10 @@ do {
} \
} while (0)
#else
#define HASH_FSCK(hh,head)
#define HASH_FSCK(hh,head)
#endif
/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to
/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to
* the descriptor to which this macro is defined for tuning the hash function.
* The app can #include <unistd.h> to get the prototype for write(2). */
#ifdef HASH_EMIT_KEYS
@ -313,12 +313,12 @@ do {
write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \
write(HASH_EMIT_KEYS, keyptr, fieldlen); \
} while (0)
#else
#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)
#else
#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)
#endif
/* default to Jenkin's hash unless overridden e.g. DHASH_FUNCTION=HASH_SAX */
#ifdef HASH_FUNCTION
#ifdef HASH_FUNCTION
#define HASH_FCN HASH_FUNCTION
#else
#define HASH_FCN HASH_JEN
@ -335,7 +335,7 @@ do {
} while (0)
/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at
/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at
* http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */
#define HASH_SAX(key,keylen,num_bkts,hashv,bkt) \
do { \
@ -356,7 +356,7 @@ do {
hashv = (hashv * 16777619) ^ _hf_key[_fn_i]; \
bkt = hashv & (num_bkts-1); \
} while(0);
#define HASH_OAT(key,keylen,num_bkts,hashv,bkt) \
do { \
unsigned _ho_i; \
@ -485,14 +485,14 @@ do {
#ifdef HASH_USING_NO_STRICT_ALIASING
/* The MurmurHash exploits some CPU's (x86,x86_64) tolerance for unaligned reads.
* For other types of CPU's (e.g. Sparc) an unaligned read causes a bus error.
* MurmurHash uses the faster approach only on CPU's where we know it's safe.
* MurmurHash uses the faster approach only on CPU's where we know it's safe.
*
* Note the preprocessor built-in defines can be emitted using:
*
* gcc -m64 -dM -E - < /dev/null (on gcc)
* cc -## a.c (where a.c is a simple test file) (Sun Studio)
*/
#if (defined(__i386__) || defined(__x86_64__))
#if (defined(__i386__) || defined(__x86_64__))
#define MUR_GETBLOCK(p,i) p[i]
#else /* non intel */
#define MUR_PLUS0_ALIGNED(p) (((unsigned long)p & 0x3) == 0)
@ -562,7 +562,7 @@ do { \
#endif /* HASH_USING_NO_STRICT_ALIASING */
/* key comparison function; return 0 if keys equal */
#define HASH_KEYCMP(a,b,len) memcmp(a,b,len)
#define HASH_KEYCMP(a,b,len) memcmp(a,b,len)
/* iterate over items in a known bucket to find desired item */
#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out) \
@ -603,36 +603,36 @@ do {
} \
if (hh_del->hh_next) { \
hh_del->hh_next->hh_prev = hh_del->hh_prev; \
}
}
/* Bucket expansion has the effect of doubling the number of buckets
* and redistributing the items into the new buckets. Ideally the
* items will distribute more or less evenly into the new buckets
* (the extent to which this is true is a measure of the quality of
* the hash function as it applies to the key domain).
*
* the hash function as it applies to the key domain).
*
* With the items distributed into more buckets, the chain length
* (item count) in each bucket is reduced. Thus by expanding buckets
* the hash keeps a bound on the chain length. This bounded chain
* the hash keeps a bound on the chain length. This bounded chain
* length is the essence of how a hash provides constant time lookup.
*
*
* The calculation of tbl->ideal_chain_maxlen below deserves some
* explanation. First, keep in mind that we're calculating the ideal
* maximum chain length based on the *new* (doubled) bucket count.
* In fractions this is just n/b (n=number of items,b=new num buckets).
* Since the ideal chain length is an integer, we want to calculate
* Since the ideal chain length is an integer, we want to calculate
* ceil(n/b). We don't depend on floating point arithmetic in this
* hash, so to calculate ceil(n/b) with integers we could write
*
*
* ceil(n/b) = (n/b) + ((n%b)?1:0)
*
*
* and in fact a previous version of this hash did just that.
* But now we have improved things a bit by recognizing that b is
* always a power of two. We keep its base 2 log handy (call it lb),
* so now we can write this with a bit shift and logical AND:
*
*
* ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0)
*
*
*/
#define HASH_EXPAND_BUCKETS(tbl) \
do { \
@ -684,7 +684,7 @@ do {
/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */
/* Note that HASH_SORT assumes the hash handle name to be hh.
/* Note that HASH_SORT assumes the hash handle name to be hh.
* HASH_SRT was added to allow the hash handle name to be passed in. */
#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn)
#define HASH_SRT(hh,head,cmpfcn) \
@ -766,10 +766,10 @@ do {
} \
} while (0)
/* This function selects items from one hash into another hash.
* The end result is that the selected items have dual presence
* in both hashes. There is no copy of the items made; rather
* they are added into the new hash through a secondary hash
/* This function selects items from one hash into another hash.
* The end result is that the selected items have dual presence
* in both hashes. There is no copy of the items made; rather
* they are added into the new hash through a secondary hash
* hash handle that must be present in the structure. */
#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \
do { \
@ -823,7 +823,7 @@ do {
#ifdef NO_DECLTYPE
#define HASH_ITER(hh,head,el,tmp) \
for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL); \
el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL))
el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL))
#else
#define HASH_ITER(hh,head,el,tmp) \
for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL); \
@ -831,7 +831,7 @@ for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL);
#endif
/* obtain a count of items in the hash */
#define HASH_COUNT(head) HASH_CNT(hh,head)
#define HASH_COUNT(head) HASH_CNT(hh,head)
#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0)
typedef struct UT_hash_bucket {
@ -840,7 +840,7 @@ typedef struct UT_hash_bucket {
/* expand_mult is normally set to 0. In this situation, the max chain length
* threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If
* the bucket's chain exceeds this length, bucket expansion is triggered).
* the bucket's chain exceeds this length, bucket expansion is triggered).
* However, setting expand_mult to a non-zero value delays bucket expansion
* (that would be triggered by additions to this particular bucket)
* until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH.
@ -848,7 +848,7 @@ typedef struct UT_hash_bucket {
* multiplier is to reduce bucket expansions, since they are expensive, in
* situations where we know that a particular bucket tends to be overused.
* It is better to let its chain length grow to a longer yet-still-bounded
* value, than to do an O(n) bucket expansion too often.
* value, than to do an O(n) bucket expansion too often.
*/
unsigned expand_mult;
@ -874,7 +874,7 @@ typedef struct UT_hash_table {
* hash distribution; reaching them in a chain traversal takes >ideal steps */
unsigned nonideal_items;
/* ineffective expands occur when a bucket doubling was performed, but
/* ineffective expands occur when a bucket doubling was performed, but
* afterward, more than half the items in the hash had nonideal chain
* positions. If this happens on two consecutive expansions we inhibit any
* further expansion, as it's not helping; this happens when the hash