Checkpoint:
- update the hierarch stuff to use btl's instead of ptl's
- start the new logic regarding how to handle local leader communicators

This commit was SVN r7691.
This commit is contained in:
parent 9a1db9abba
commit b42d4ac780
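A minimal, self-contained sketch of the local-leader scheme this commit starts to implement: processes are first split by a color (one low-level communicator per group), the lowest rank of each group acts as the local leader, and a second split collects all leaders into the lleader communicator. The example below uses the public MPI API and an invented color formula purely for illustration; the commit itself works on OMPI's internal ompi_comm_split.

    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        int rank, color, lrank, am_lleader, llrank;
        MPI_Comm lcomm, llcomm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* First split: one low-level communicator per color
           (in coll/hierarch the color describes which group of
           processes shares the faster interconnect). */
        color = rank / 2;                 /* invented color, illustration only */
        MPI_Comm_split(MPI_COMM_WORLD, color, rank, &lcomm);

        /* The first process of each low-level communicator is its local leader. */
        MPI_Comm_rank(lcomm, &lrank);
        am_lleader = (0 == lrank);

        /* Second split: every process calls it, but only the local leaders
           end up in the communicator that is actually used afterwards. */
        MPI_Comm_split(MPI_COMM_WORLD, am_lleader, rank, &llcomm);
        if (am_lleader) {
            MPI_Comm_rank(llcomm, &llrank);
            printf("rank %d: local leader %d for color %d\n", rank, llrank, color);
        }

        MPI_Comm_free(&llcomm);
        MPI_Comm_free(&lcomm);
        MPI_Finalize();
        return 0;
    }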
@@ -21,19 +21,23 @@

#include "mpi.h"
#include "communicator/communicator.h"
#include "proc/proc.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "coll_hierarch.h"

#include "mca/ptl/ptl.h"
#include "mca/pml/teg/src/pml_teg_proc.h"
#include "mca/pml/teg/src/pml_teg_ptl.h"
#include "class/ompi_bitmap.h"
#include "mca/bml/bml.h"
#include "mca/bml/base/base.h"
#include "mca/pml/pml.h"
#include "mca/btl/btl.h"


/* local functions and data */
#define HIER_MAXPROTOCOL 7
static int mca_coll_hierarch_max_protocol=HIER_MAXPROTOCOL;

static char hier_prot[HIER_MAXPROTOCOL][5]={"0","tcp","ib","gm","mx","elan4","sm"};
static char hier_prot[HIER_MAXPROTOCOL][6]={"0","tcp","gm","mx","mvapi","openib","sm"};

static void mca_coll_hierarch_checkfor_component (struct ompi_communicator_t *comm,
                                                  char *component_name, int *key,
@@ -113,7 +117,6 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,

    /* This module only works for intra-communicators at the moment */
    if ( OMPI_COMM_IS_INTER(comm) ) {
        *priority = 0;
        return NULL;
    }
@@ -190,13 +193,11 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
const struct mca_coll_base_module_1_0_0_t *
mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
{
    int color, ncount;
    int *colorarr=NULL, *llr=NULL;
    int color;
    int *llr=NULL;
    int size, rank, ret=OMPI_SUCCESS;
    int i, j, c, level;
    int found;

    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    struct mca_coll_base_comm_t *data=NULL;

    rank = ompi_comm_rank(comm);
@@ -207,13 +208,13 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)

    /* Generate the subcommunicator based on the color returned by
       the previous function. */
    ret = ompi_comm_split ( comm, color, rank, &llcomm, 0 );
    ret = ompi_comm_split ( comm, color, rank, &lcomm, 0 );
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }

    data->hier_comm = comm;
    data->hier_llcomm = llcomm;
    data->hier_lcomm = lcomm;
    data->hier_num_reqs = 2 * size;
    data->hier_reqs = (ompi_request_t **) malloc (sizeof(ompi_request_t)*size*2);
    if ( NULL == data->hier_reqs ) {
@@ -238,6 +239,21 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
        data->hier_am_lleader = 1; /*true */
    }

    /* Generate the lleader communicator assuming that all lleaders are the first
       process in the list of processes with the same color. A function generating
       other lleader-comms will follow soon. */
    ompi_comm_split ( comm, data->hier_am_lleader, rank, &llcomm, 0);
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }
    data->hier_llcomm = (struct ompi_communicator_t *)malloc (HIER_DEFAULT_NUM_LLCOMM *
                                                              sizeof(struct ompi_communicator_t *));
    if ( NULL == data->hier_llcomm ) {
        goto exit;
    }
    data->hier_num_llcomm = HIER_DEFAULT_NUM_LLCOMM;
    data->hier_llcomm[0] = llcomm;


    /* This is the point where I will introduce later on a function trying to
       compact the colorarr array. Not done at the moment */
@@ -248,7 +264,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
        free (llr);
    }
    if ( OMPI_SUCCESS != ret ) {
        ompi_comm_free ( &llcomm );
        ompi_comm_free ( &lcomm );
        if ( NULL != data ) {
            if ( NULL != data->hier_reqs ) {
                free ( data->hier_reqs);
@@ -274,19 +290,16 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
*/
int mca_coll_hierarch_module_finalize(struct ompi_communicator_t *comm)
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    struct mca_coll_base_comm_t *data=NULL;

    data = comm->c_coll_selected_data;
    llcomm = data->hier_llcomm;
    lcomm = data->hier_lcomm;

    ompi_comm_free (&llcomm);
    ompi_comm_free (&lcomm);
    free ( data->hier_reqs );
    free ( data->hier_lleaders );
    free ( data->hier_colorarr );
    if ( NULL != data->hier_topo.topo_next ) {
        free (data->hier_topo.topo_next);
    }
    free ( data );

    comm->c_coll_selected_data = NULL;
@@ -316,86 +329,78 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
                                       int *key,
                                       int *ncount )
{
    mca_pml_proc_t *proc=NULL;
    mca_ptl_proc_t *ptl_proc=NULL;
    mca_ptl_base_module_t *ptl_module=NULL;
    mca_ptl_base_component_t *ptr=NULL;
    ompi_bitmap_t reachable;
    ompi_proc_t **procs=NULL;
    struct mca_bml_base_endpoint_t **bml_endpoints=NULL;
    struct mca_bml_base_btl_array_t *bml_btl_array=NULL;
    mca_bml_base_btl_t *bml_btl=NULL;
    mca_btl_base_component_t *btl=NULL;

    int i, j, size;
    int i, size, rc;

    int counter=0;
    int firstproc=999999;
    int rank = -1;

    int listsize=1;
    int use_next, walk_through_list;
    int use_rdma=0;

    /* default values in case an error occurs */
    *ncount=0;
    *key=MPI_UNDEFINED;

    /* Shall we just check the first element in the ptl list ? */
    if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_walk_through_list_param,
                                                  &walk_through_list)) {
    /* Shall we check the rdma list instead of the send-list in the endpoint-structure? */
    /* if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_rdma_param,
                                                     &use_rdma)) {
        return;
    }

    /* Shall we use the first_elem list or the next_elem list? */
    if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_use_next_param,
                                                  &use_next)) {
        return;
    }

    */

    size = ompi_comm_size ( comm );
    rank = ompi_comm_rank ( comm );

    OBJ_CONSTRUCT(&reachable, ompi_bitmap_t);
    rc = ompi_bitmap_init(&reachable, size);
    if(OMPI_SUCCESS != rc) {
        return;
    }

    rc = mca_bml.bml_add_procs (
                                size,
                                procs,
                                bml_endpoints,
                                &reachable
                                );

    if(OMPI_SUCCESS != rc) {
        return;
    }

    for ( i=0; i<size; i++ ) {
        if ( rank == i ) {
            /* skip myself */
            continue;
        }

        proc = mca_pml_teg_proc_lookup_remote (comm, i);
        if ( use_next ) {
            ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_first);
            if ( walk_through_list ) {
                /*
                 * Walking through the list might be unnecessary. The assumption is
                 * that if we did not register this as the first protocol, there is
                 * a protocol which is faster than this one.
                 *
                 * Example: on an IB-cluster with dual processor nodes, I can talk
                 * to all procs with IB, however the process on my node will
                 * hopefully have sm registered as its first protocol.
                 */

                listsize = mca_ptl_array_get_size(&proc->proc_ptl_first);
            }

        if ( use_rdma ) {
            bml_btl_array = &(bml_endpoints[i]->btl_rdma);
        }
        else {
            ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_next);
            if ( walk_through_list ) {
                listsize = mca_ptl_array_get_size(&proc->proc_ptl_next);
            }
            bml_btl_array = &(bml_endpoints[i]->btl_send);
        }
        bml_btl = mca_bml_base_btl_array_get_index ( bml_btl_array, 0 );
        btl = bml_btl->btl->btl_component;

        for ( j=0; j<listsize;j++) {
            ptl_module = ptl_proc->ptl;
            ptr = ptl_module->ptl_component;
            /* sanity check */
            if ( strcmp(btl->btl_version.mca_type_name,"btl") ) {
                printf("Oops, got the wrong component! type_name = %s\n",
                       btl->btl_version.mca_type_name );
            }

            /* sanity check */
            if ( strcmp(ptr->ptlm_version.mca_type_name,"ptl") ) {
                printf("Oops, got the wrong component! type_name = %s\n",
                       ptr->ptlm_version.mca_type_name );
            }
            /* check for the required component */
            if (! strcmp (btl->btl_version.mca_component_name, component_name)){
                counter++;

            /* check for the required component */
            if (! strcmp (ptr->ptlm_version.mca_component_name, component_name)){
                counter++;

                if (i<firstproc ) {
                    firstproc = i;
                }
                if (i<firstproc ) {
                    firstproc = i;
                }
            }
        }
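The loop above boils down to a simple rule: for each remote peer, look at the component name of the first BTL in its endpoint's send (or rdma) list, count how many peers are reachable through the requested component, and remember the lowest such rank. A self-contained sketch of that counting step, with the BTL lookup abstracted into a plain string array (the array and function names here are illustrative only, not part of the commit):

    #include <stdio.h>
    #include <string.h>

    /* Count the peers whose first BTL matches the requested component name and
       return the lowest matching rank (the later "key"), or -1 if none matches. */
    static int count_reachable_peers(const char **first_btl_of_peer, int size,
                                     int myrank, const char *component_name,
                                     int *ncount)
    {
        int i, counter = 0, firstproc = -1;

        for (i = 0; i < size; i++) {
            if (i == myrank) {
                continue;                     /* skip myself */
            }
            if (0 == strcmp(first_btl_of_peer[i], component_name)) {
                counter++;
                if (-1 == firstproc) {
                    firstproc = i;            /* lowest matching rank so far */
                }
            }
        }
        *ncount = counter;
        return firstproc;
    }

    int main(void)
    {
        /* Invented data: the first send-BTL reported for each of four peers. */
        const char *btls[4] = { "sm", "tcp", "sm", "tcp" };
        int ncount, key;

        key = count_reachable_peers(btls, 4, 0, "tcp", &ncount);
        printf("tcp reaches %d peer(s), first one is rank %d\n", ncount, key);
        return 0;
    }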
@@ -23,7 +23,6 @@
#include "mca/mca.h"
#include "mca/coll/coll.h"
#include "request/request.h"
#include "mca/pml/pml.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
@@ -40,33 +39,32 @@ extern int mca_coll_hierarch_verbose;
extern int mca_coll_hierarch_walk_through_list_param;
extern int mca_coll_hierarch_use_next_param;


#define HIER_DEFAULT_NUM_LLCOMM 5
/*
 * Data structure for attaching data to the communicator
 */

struct mca_coll_hierarch_topo {
    int topo_root;
    int topo_prev;
    int topo_nextsize;
    int topo_maxsize;
    int *topo_next;
struct mca_coll_hierarch_llead {
    struct ompi_communicator_t *hier_llcomm; /* local leader communicator */
    int hier_num_lleaders; /* number of local leaders */
    int *hier_lleaders; /* list of local leaders, ranks in comm */
    int hier_my_lleader_on_lcomm; /* rank of my lleader in llcomm */
    int hier_am_lleader; /* am I an lleader? */
    int hier_my_lleader; /* pos. of my lleader in hier_lleaders */
};

struct mca_coll_base_comm_t {
    struct ompi_communicator_t *hier_comm; /* link back to the attached comm */
    struct ompi_communicator_t *hier_llcomm; /* low level communicator */
    int hier_level; /* level in the hierarchy. just debugging */
    int hier_num_lleaders; /* number of local leaders */
    int *hier_lleaders; /* list of local leaders, ranks in comm */
    int hier_my_lleader; /* pos. of my lleader in hier_lleaders */
    int hier_my_lleader_on_llcomm; /* rank of my lleader in llcomm */
    int hier_am_lleader; /* am I an lleader? */
    int hier_num_reqs; /* num. of requests */
    ompi_request_t **hier_reqs; /* list of requests */
    int hier_type_colorarr; /* format in which the colorarr is stored */
    int hier_num_colorarr; /* size of the colorarr array */
    int* hier_colorarr; /* array containing the color of all procs */
    struct mca_coll_hierarch_topo hier_topo; /* topology used in the coll ops */
    struct ompi_communicator_t *hier_comm; /* link back to the attached comm */
    struct ompi_communicator_t *hier_lcomm; /* low level communicator */
    struct mca_coll_hierarch_llead *hier_llead; /* structure for lleader communicator */
    int hier_num_llead; /* number of llead structs */
    int hier_level; /* level in the hierarchy. just debugging */
    int hier_num_reqs; /* num. of requests */
    ompi_request_t **hier_reqs; /* list of requests */
    int hier_type_colorarr; /* format in which the colorarr is stored */
    int hier_num_colorarr; /* size of the colorarr array */
    int* hier_colorarr; /* array containing the color of all procs */
};

/* These are various modes how the colorarr is stored. The reason
@@ -163,6 +161,13 @@ static inline void mca_coll_hierarch_get_all_lleaders ( int size, int *carr, int
    return;
}

static inline struct ompi_communicator_t* mca_coll_hierarch_get_llcomm ( int rank,
                                                                         struct mca_coll_base_comm_t *data,
                                                                         int* lleader )
{
    return NULL;
}

static inline void mca_coll_hierarch_get_lleader (int rank, struct mca_coll_base_comm_t *data,
                                                  int* lleader )
{
@@ -35,7 +35,7 @@
int mca_coll_hierarch_barrier_intra(struct ompi_communicator_t *comm)
{
    opal_output_verbose(10, mca_coll_base_output, "In hierarch barrier_intra");
    return comm->c_coll_basic_module->coll_barrier(comm);
    return MPI_SUCCESS;
}
@@ -20,6 +20,7 @@
#include "mpi.h"
#include "ompi/include/constants.h"
#include "opal/util/output.h"
#include "communicator/communicator.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "mca/coll/base/coll_tags.h"
@@ -28,28 +29,12 @@
/*
 * bcast_intra
 *
 * Function: - broadcast using O(N) algorithm
 * Function: - broadcast using hierarchical algorithm
 * Accepts:  - same arguments as MPI_Bcast()
 * Returns:  - MPI_SUCCESS or error code
 */


static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
                                                     int count,
                                                     ompi_datatype_t * datatype,
                                                     int root,
                                                     ompi_communicator_t * comm,
                                                     int segsize,
                                                     struct mca_coll_hierarch_topo *topo);

static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
                                                     ompi_datatype_t *datatype,
                                                     int root,
                                                     struct mca_coll_base_comm_t *data,
                                                     int *segsize);
static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data );



int mca_coll_hierarch_bcast_intra(void *buff,
@@ -60,44 +45,25 @@ int mca_coll_hierarch_bcast_intra(void *buff,
{
    struct mca_coll_base_comm_t *data=NULL;
    struct ompi_communicator_t *llcomm=NULL;
    int lleader_of_root, lleader_replaced_by_root=0;
    int rank, ret, lroot;
    int segsize;
    struct ompi_communicator_t *lcomm=NULL;
    int lleader;
    int rank, ret, llroot;

    rank = ompi_comm_rank ( comm );
    data = comm->c_coll_selected_data;
    llcomm = data->hier_llcomm;
    lcomm = data->hier_lcomm;

    /* Determine whether
       a) we have the same local leader as the root of this operation
       b) the root and the local leader are identical

       If a) is true and b) is not, we will replace the local leader for this
       subgroup by the root
    /* This function returns the local leader communicator
       which *always* contains the root of this operation.
       This might involve creating a new communicator. This is
       also the reason that *every* process in comm has to call
       this function
    */

    mca_coll_hierarch_get_lleader (root, data, &lleader_of_root);
    if ( (lleader_of_root == data->hier_my_lleader) && (lleader_of_root != root )) {
        lleader_replaced_by_root = 1;
    }
    llcomm = mca_coll_hierarch_get_llcomm ( root, data, &llroot);

    /* Bcast on the upper level among the local leaders */
    if ( rank == root || ( data->hier_am_lleader && !lleader_replaced_by_root) ) {
        /* this function sets up the topology used in the segmented
           bcast afterwards and determines the segment size. */
        ret = mca_coll_hierarch_intra_bcast_setup_topo (count, datatype, root,
                                                        data, &segsize);
        if ( OMPI_SUCCESS != ret ) {
            return ret;
        }
        /* ok, now do the actual bcast. Hopefully, this routine will come
           out of Jelena's collective module in the end. For the moment,
           I've implemented it myself
        */
        ret = mca_coll_hierarch_intra_segmented_bcast (buff, count,
                                                       datatype, root,
                                                       comm, segsize,
                                                       &(data->hier_topo));
    if ( MPI_UNDEFINED == llroot ) {
        llcomm->c_coll.coll_bcast(buff, count, datatype, llroot, llcomm);
        if ( OMPI_SUCCESS != ret ) {
            return ret;
        }
@@ -106,271 +72,11 @@ int mca_coll_hierarch_bcast_intra(void *buff,
    /* once the local leaders got the data from the root, they can distribute
       it to the processes in their local, low-level communicator.
    */

    if ( MPI_COMM_NULL != llcomm ) {
        if ( lleader_replaced_by_root ) {
            mca_coll_hierarch_map_rank(root, data, &lroot);
            ret = llcomm->c_coll.coll_bcast(buff, count, datatype, lroot,
                                            llcomm);
        }
        else {
            /* Assumption: the rank of the local leader on llcomm is always 0 */
            ret = llcomm->c_coll.coll_bcast(buff, count, datatype, 0, llcomm );
        }
    mca_coll_hierarch_get_lleader (root, data, &lleader);
    ret = lcomm->c_coll.coll_bcast(buff, count, datatype, lleader, lcomm );
    }

    return ret;
}


/*
 * This is the mother of all segmented bcast algorithms of any type.
 * Due to the general structure of the topo argument, you can use this function
 * for any type of algorithm - it just depends on the settings of topo.
 *
 * The implementation is strongly leaning on the implementation in FT-MPI.
 */

static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
                                                     int count,
                                                     ompi_datatype_t * datatype,
                                                     int root,
                                                     ompi_communicator_t * comm,
                                                     int segsize,
                                                     struct mca_coll_hierarch_topo *topo)
{
    int err=0, i, j;
    int size, rank;
    int segcount;      /* Number of elements sent with each segment */
    int num_segments;  /* Number of segments */
    int recvcount;     /* the same as segcount, except for the last segment */
    int typelng, realsegsize;
    char *tmpbuf;
    long rlb, ext;
    ompi_request_t ** recv_request= NULL;

    size = ompi_comm_size ( comm );
    rank = ompi_comm_rank ( comm );

    /* ------------------------------------------- */
    /* special case for size == 1 and 2 */
    if (size == 1) {
        return OMPI_SUCCESS;
    }
    if (size == 2) {
        if (rank == root) {
            err = mca_pml.pml_send(buffer, count, datatype, (rank+1)%2,
                                   MCA_COLL_BASE_TAG_BCAST,
                                   MCA_PML_BASE_SEND_STANDARD, comm );
            if ( OMPI_SUCCESS != err ) {
                return err;
            }
        } else {
            err = mca_pml.pml_recv(buffer, count, datatype, root,
                                   MCA_COLL_BASE_TAG_BCAST, comm,
                                   MPI_STATUS_IGNORE);
            if ( OMPI_SUCCESS != err) {
                return err;
            }
        }
        return OMPI_SUCCESS;
    }
    /* end special case for size == 1 and 2 */


    tmpbuf = (char *) buffer;
    /* -------------------------------------------------- */
    /* Determine number of segments and number of elements
       sent per operation */
    err = ompi_ddt_type_size( datatype, &typelng);
    if ( OMPI_SUCCESS != err) {
        return ( err );
    }

    if ( segsize > 0 ) {
        segcount = segsize/typelng;
        num_segments = count/segcount;
        if (0 != (count % segcount)) {
            num_segments++;
        }
    }
    else {
        segcount = count;
        num_segments = 1;
    }

    /* Determine real segment size = segcount * extent */
    err = ompi_ddt_get_extent( datatype, &rlb, &ext );
    if ( OMPI_SUCCESS != err) {
        return ( err );
    }
    realsegsize = segcount*ext;

    /* ----------------------------------------------------- */
    /* Post Irecv if not root-node */
    if (rank != root) {
        /* has a parent. need to receive before sending */
        if ( num_segments > 2 * size ) {
            recv_request = (MPI_Request*)malloc ( sizeof(ompi_request_t *)*num_segments );
        }
        else {
            recv_request = comm->c_coll_selected_data->hier_reqs;
        }

        for( i = 0; i < num_segments; i++) {
            if ( i == (num_segments -1) ) {
                recvcount = count - (segcount * i);
            }
            else {
                recvcount = segcount;
            }
            err = mca_pml.pml_irecv(tmpbuf+i*realsegsize, recvcount, datatype,
                                    topo->topo_prev, MCA_COLL_BASE_TAG_BCAST,
                                    comm, &recv_request[i]);
            if ( OMPI_SUCCESS != err ) {
                return ( err );
            }
        }
    }

    /* ---------------------------------------------- */
    /* If leaf node, just finish the receive */
    if (topo->topo_nextsize == 0) {
        if(recv_request != NULL) {
            err = ompi_request_wait_all (num_segments, recv_request, MPI_STATUSES_IGNORE);
            if ( OMPI_SUCCESS != err ) {
                return ( err );
            }
        }
    }
    else {
        /* ------------------------------------------ */
        /* root or intermediate node */
        for( i = 0; i < num_segments; i++) {
            if (rank != root) {
                /* intermediate nodes have to wait for the completion of
                   the corresponding receive */
                err = ompi_request_wait_all(1, &recv_request[i], MPI_STATUS_IGNORE);
                if ( OMPI_SUCCESS != err ) {
                    return ( err );
                }
            }
            for ( j = 0; j < topo->topo_nextsize; j++) {
                if ( i == ( num_segments - 1 )) {
                    recvcount = count - ( segcount * i);
                }
                else {
                    recvcount = segcount;
                }

                err = mca_pml.pml_send(tmpbuf+i*realsegsize, recvcount,
                                       datatype, topo->topo_next[j],
                                       MCA_COLL_BASE_TAG_BCAST,
                                       MCA_PML_BASE_SEND_STANDARD, comm );
                if( OMPI_SUCCESS != err ) {
                    return ( err );
                }
            } /* for ( j = 0; j < topo_nextsize; j++) */
        } /* for ( i = 0; i < num_segments; i++) */
    }

    if ( num_segments > 2 * size ) {
        if(recv_request != NULL) {
            free(recv_request);
        }
    }

    return OMPI_SUCCESS;
}


/*
 * This routine does the magic to determine which topology (bmtree, linear, chain etc.)
 * would perform best in this scenario. At the moment, we just do bmtree.
 *
 * The implementation is once again strongly related to the version in FT-MPI.
 */
static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
                                                     ompi_datatype_t *datatype,
                                                     int root,
                                                     struct mca_coll_base_comm_t *data,
                                                     int *segsize)
{
    /* without spending time on that issue, I set for the moment segsize to 32k. */
    *segsize = 32768;

    /* without spending time on that issue, I set the topology to a binomial tree */
    setup_topo_bmtree ( root, data );

    return OMPI_SUCCESS;
}


static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data )
{
    /* This implementation is based on the closest-first bmtree algorithms
       in FT-MPI implemented by George/Jelena; it has however a couple of
       significant modifications:
       - we are not having a contiguous list of participating processes,
         but a list containing the ranks of the participating processes.
    */

    int childs = 0;
    int rank, size, mask=1;
    int index, remote, found;
    int rootpos;
    struct mca_coll_hierarch_topo *topo=&(data->hier_topo);



    if (found) {
        size = data->hier_num_lleaders;
    }
    else {
        size = data->hier_num_lleaders + 1;
        data->hier_lleaders[rootpos] = root;
    }
    rank = data->hier_my_lleader;

    /* allocate the array of child processes, if not yet done */
    if ( NULL == topo->topo_next && 0 == topo->topo_maxsize ) {
        topo->topo_next = (int *) malloc (data->hier_num_lleaders+1 * sizeof(int));
        if ( NULL != topo->topo_next ) {
            return;
        }
        topo->topo_maxsize=data->hier_num_lleaders+1;
    }

    index = rank - rootpos;

    if( index < 0 ) index += size;
    while( mask <= index ) mask <<= 1;

    /* Determine the rank of my father */
    if( rootpos == rank ) {
        topo->topo_prev = root;
    }
    else {
        remote = (index ^ (mask >> 1)) + rootpos;
        if( remote >= size ) {
            remote -= size;
        }
        topo->topo_prev = data->hier_lleaders[remote];
    }

    /* And now let's fill in my children */
    while( mask < size ) {
        remote = (index ^ mask);
        if( remote >= size ) break;
        remote += rootpos;
        if( remote >= size ) remote -= size;
        topo->topo_next[childs] = data->hier_lleaders[remote];
        mask <<= 1;
        childs++;
    }

    topo->topo_nextsize = childs;
    return;
}

#endif
@@ -22,11 +22,9 @@

#include "ompi_config.h"
#include "coll_hierarch.h"
#include "coll-hierarch-version.h"

#include "mpi.h"
#include "mca/coll/coll.h"
#include "coll_hierarch.h"

/*
 * Public string showing the coll ompi_hierarch component version number
@@ -40,8 +38,7 @@ const char *mca_coll_hierarch_component_version_string =
int mca_coll_hierarch_priority_param = -1;
int mca_coll_hierarch_verbose_param = -1;
int mca_coll_hierarch_verbose = 0;
int mca_coll_hierarch_walk_through_list_param=-1;
int mca_coll_hierarch_use_next_param=-1;
int mca_coll_hierarch_use_rdma_param=-1;


/*
@@ -82,12 +79,10 @@ const mca_coll_base_component_1_0_0_t mca_coll_hierarch_component = {

    {
        /* Whether the component is checkpointable or not */

        true
    },

    /* Initialization / querying functions */

    mca_coll_hierarch_init_query,
    mca_coll_hierarch_comm_query,
    mca_coll_hierarch_comm_unquery
@@ -103,10 +98,8 @@ static int hierarch_open(void)
    mca_coll_hierarch_verbose_param =
        mca_base_param_register_int("coll", "hierarch", "verbose", NULL,
                                    mca_coll_hierarch_verbose);
    mca_coll_hierarch_walk_through_list_param =
        mca_base_param_register_int("coll", "hierarch", "walk_through_list", NULL, 0);
    mca_coll_hierarch_use_next_param =
        mca_base_param_register_int("coll", "hierarch", "use_next", NULL, 0);
    mca_base_param_register_int("coll", "hierarch", "use_rdma", NULL, 0);

    return OMPI_SUCCESS;
}
@@ -27,7 +27,6 @@
#include "coll_hierarch.h"


#ifdef SIMPLE_HIERARCH
/*
 * reduce_intra
 *
@@ -40,102 +39,5 @@ int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
                                   struct ompi_op_t *op,
                                   int root, struct ompi_communicator_t *comm)
{
    struct mca_coll_base_comm_t *data=NULL;
    struct ompi_communicator_t *llcomm=NULL;
    long true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    int i, rank, ret;

    rank = ompi_comm_rank ( comm );
    data = comm->c_coll_selected_data;
    llcomm = data->hier_llcomm;


    /*
     * collect the data from the low-level communicators. Result will be stored
     * on the local leaders.
     */
    if ( MPI_COMM_NULL != llcomm ) {
        ret = llcomm->c_coll.coll_reduce(sbuf, rbuf, count, dtype, op,
                                         data->hier_my_lleader, llcomm );
    }


    /* trivial linear reduction receiving the data from all local leaders.
       need something significantly better */
    if ( rank == root ) {
        /* Root receives and reduces messages */
        ompi_ddt_get_extent(dtype, &lb, &extent);
        ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);

        free_buffer = malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;

        if ( !data->hier_am_lleader ) {
            /* Initialize the receive buffer. */
            ret = mca_pml.pml_recv(rbuf, count, dtype, data->hier_lleader[0],
                                   MCA_COLL_BASE_TAG_REDUCE, comm,
                                   MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != ret) {
                goto exit;
            }
        }

        /* Loop receiving and calling reduction function (C or Fortran). */
        for (i = 1; i < data->hier_num_lleaders; i++) {
            if ( data->hier_lleader[i] == rank ) {
                continue;
            }
            ret = mca_pml.pml_recv(pml_buffer, count, dtype, data->hier_lleader[i],
                                   MCA_COLL_BASE_TAG_REDUCE, comm,
                                   MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != ret) {
                goto exit;
            }

            /* Perform the reduction */
            ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);
        }
    }
    else if ( data->hier_am_lleader ) {
        if ( MPI_COMM_NULL != llcomm ) {
            ret = mca_pml.pml_send ( rbuf, count, dtype, root,
                                     MCA_COLL_BASE_TAG_REDUCE,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm);
        }
        else {
            ret = mca_pml.pml_send ( sbuf, count, dtype, root,
                                     MCA_COLL_BASE_TAG_REDUCE,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm);
        }
    }

 exit:
    if ( NULL != free_buffer ) {
        free ( free_buffer);
    }


    return ret;
    return OMPI_SUCCESS;
}

#else

int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
                                   struct ompi_datatype_t *dtype,
                                   struct ompi_op_t *op,
                                   int root, struct ompi_communicator_t *comm)
{
    opal_output_verbose(10, mca_coll_base_output, "In hierarch reduce_intra");
    return comm->c_coll_basic_module->coll_reduce(sbuf, rbuf, count, dtype,
                                                  op, root, comm);
}

#endif