
next step: the interface still needs discussion (especially with UT); it is, however, pretty close to what I think it should be. Hopefully, the bcast_segmented routine will be removed again soon and the version from the UT coll module used instead.

This commit was SVN r4628.
This commit is contained in:
Edgar Gabriel 2005-03-02 13:28:39 +00:00
parent d92de0c662
commit 6579de7791
5 changed files with 372 additions and 62 deletions

View file

@@ -40,6 +40,7 @@
static void mca_coll_hierarch_checkfor_component (struct ompi_communicator_t *comm,
char *component_name, int *key,
int *done );
static void mca_coll_hierarch_dump_struct ( struct mca_coll_base_comm_t *c);
/*
* Linear set of collective algorithms
@@ -129,17 +130,11 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
mca_coll_hierarch_checkfor_component ( comm, "sm", &color, &ncount);
#define KNOW_HOW_TO_CALL_ALLGATHER
#ifdef KNOW_HOW_TO_CALL_ALLGATHER
comm->c_coll_basic_module->coll_allreduce (&ncount, &maxncount, 1, MPI_INT,
MPI_MAX, comm );
comm->c_coll_basic_module->coll_allgather (&color, 1, MPI_INT,
colorarr, 1, MPI_INT, comm );
#else
maxncount = ncount;
#endif
if ( 1 == maxncount ) {
/*
* this means that no process has a partner it can talk to via 'sm',
@@ -197,7 +192,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
data->hier_llcomm = llcomm;
data->hier_num_reqs = 2 * size;
data->hier_reqs = (ompi_request_t **) malloc (sizeof(ompi_request_t)*2*size);
data->hier_reqs = (ompi_request_t **) malloc (sizeof(ompi_request_t)*size*2);
if ( NULL == data->hier_reqs ) {
goto exit;
}
@@ -236,7 +231,9 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
}
data->hier_num_lleaders = c-1;
data->hier_lleaders = (int *) malloc ( sizeof(int) * data->hier_num_lleaders);
/* we allocate one more element than required to be able to add the
root of an operation to this list */
data->hier_lleaders = (int *) malloc ( sizeof(int) * (data->hier_num_lleaders + 1));
if ( NULL == data->hier_lleaders ) {
goto exit;
}
@@ -260,6 +257,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
if ( NULL != data->hier_lleaders ) {
free ( data->hier_lleaders);
}
free ( data );
}
return NULL;
@@ -283,6 +281,9 @@ int mca_coll_hierarch_module_finalize(struct ompi_communicator_t *comm)
ompi_comm_free (&llcomm);
free ( data->hier_reqs);
free ( data->hier_lleaders);
if ( NULL != data->hier_topo.topo_next ) {
free (data->hier_topo.topo_next);
}
free ( data );
comm->c_coll_selected_data = NULL;
@@ -326,7 +327,7 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
for ( i=0; i<size; i++ ) {
proc = mca_pml_teg_proc_lookup_remote (comm, i);
#ifdef TRY_NEXT_INSTEAD OF FIRST
#ifdef TRY_NEXT_INSTEAD_OF_FIRST
ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_next);
listsize = mca_ptl_array_get_size(&proc->proc_ptl_next);
#else
@@ -356,9 +357,6 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
/* check for the required component */
if (! strcmp (ptr->ptlm_version.mca_component_name, component_name)){
#ifdef VERBOSE
printf("found component %s for rank %d \n", component_name, i );
#endif
counter++;
if (i<firstproc ) {
@@ -369,20 +367,13 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
}
*ncount = counter; /* true */
/* Print the final result */
/* final decision */
if ( counter == 1 ) {
/* this is the section indicating that we are not
using this component */
if ( myrank == -1 ) {
#ifdef VERBOSE
printf("something really weird has happened!\n");
#endif
}
else {
#ifdef VERBOSE
printf("component %s is not used to talk to anyone in this comm\n",
component_name );
#endif
firstproc = MPI_UNDEFINED;
}
}
@@ -390,19 +381,25 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
if ( myrank < firstproc ) {
firstproc = myrank;
}
#ifdef VERBOSE
if ( counter == size ) {
printf("I can talk to all processes in this comm using %s key=%d\n",
component_name, firstproc );
}
else {
printf(" I can talk to %d processes in this comm using %s key=%d\n",
counter, component_name, firstproc );
}
#endif
}
*key = firstproc;
return;
}
static void mca_coll_hierarch_dump_struct ( struct mca_coll_base_comm_t *c)
{
int i;
printf("Dump of hier-struct for comm %s cid %u\n",
c->hier_comm->c_name, c->hier_comm->c_contextid);
printf(" no of lleaders %d my_leader %d am_leader %d\n",
c->hier_num_lleaders, c->hier_my_lleader, c->hier_am_lleader );
for (i=0; i<c->hier_num_lleaders; i++ ) {
printf(" lleader[%d] = %d\n", i, c->hier_lleaders[i]);
}
return;
}
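Condensed from the hunks above, the decision logic in comm_query boils down to the following sketch (error handling omitted, only an illustration of the flow): each process determines the lowest rank it can reach via "sm" and how many such processes it found, and the component only stays in the running if at least one process found a partner.

    int color, ncount, maxncount;

    /* color:  lowest rank reachable through "sm" (or MPI_UNDEFINED),
       ncount: how many processes were found reachable through "sm" */
    mca_coll_hierarch_checkfor_component ( comm, "sm", &color, &ncount );

    /* global maximum of ncount: does anyone have an sm partner at all? */
    comm->c_coll_basic_module->coll_allreduce ( &ncount, &maxncount, 1, MPI_INT,
                                                MPI_MAX, comm );
    if ( 1 == maxncount ) {
        /* no process has a partner it can talk to via "sm";
           the hierarch module bows out for this communicator */
    }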

View file

@@ -39,7 +39,17 @@ extern int mca_coll_hierarch_verbose;
/*
* Data structure for attaching data to the communicator
*/
struct mca_coll_hierarch_topo {
    int topo_root;     /* root of the tree */
    int topo_prev;     /* parent of this process in the tree */
    int topo_nextsize; /* number of children of this process */
    int topo_maxsize;  /* allocated length of the topo_next array */
    int *topo_next;    /* list of children of this process */
};
struct mca_coll_base_comm_t {
struct ompi_communicator_t *hier_comm; /* link back to the attached comm */
struct ompi_communicator_t *hier_llcomm; /* low level communicator */
int hier_num_lleaders; /* number of local leaders */
int *hier_lleaders; /* list of local leaders */
@@ -47,10 +57,22 @@ extern int mca_coll_hierarch_verbose;
int hier_am_lleader; /* am I an lleader? */
int hier_num_reqs; /* num. of requests */
ompi_request_t **hier_reqs; /* list of requests */
struct mca_coll_hierarch_topo hier_topo; /* topology used in the coll ops */
};
#define MCA_COLL_HIERARCH_IS_ROOT_LLEADER(_root, _lls, _llsize, _found, _pos) { \
int _i; \
for (_found=0, _pos=_llsize, _i=0; _i<_llsize; _i++) { \
if ( _lls[_i] == _root ) { \
_found = 1; \
_pos = _i; \
break; \
} \
} \
}
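A usage sketch for this macro, mirroring the call in setup_topo_bmtree further down: it scans the lleader list for the root and reports whether and at which position it was found.

    int found, rootpos;

    MCA_COLL_HIERARCH_IS_ROOT_LLEADER ( root, data->hier_lleaders,
                                        data->hier_num_lleaders, found, rootpos );
    if ( !found ) {
        /* the root is not an lleader itself; it is appended to the list,
           which is why hier_lleaders is allocated with one extra slot */
        data->hier_lleaders[data->hier_num_lleaders] = root;
    }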
/*
* coll API functions
*/

View file

@@ -37,15 +37,3 @@ int mca_coll_hierarch_barrier_intra(struct ompi_communicator_t *comm)
}
/*
* barrier_inter
*
* Function: - barrier using O(log(N)) algorithm
* Accepts: - same as MPI_Barrier()
* Returns: - MPI_SUCCESS or error code
*/
int mca_coll_hierarch_barrier_inter(struct ompi_communicator_t *comm)
{
ompi_output_verbose(10, mca_coll_base_output, "In hierarch barrier_inter");
return comm->c_coll_basic_module->coll_barrier(comm);
}

View file

@@ -23,7 +23,22 @@
#include "mca/coll/base/coll_tags.h"
#include "coll_hierarch.h"
static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
int count,
ompi_datatype_t * datatype,
int root,
ompi_communicator_t * comm,
int segsize,
struct mca_coll_hierarch_topo *topo);
static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
ompi_datatype_t *datatype,
int root,
struct mca_coll_base_comm_t *data,
int *segsize);
static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data );
#ifdef SIMPLE_HIERARCH_BCAST
/*
* bcast_intra
*
@@ -31,8 +46,10 @@
* Accepts: - same arguments as MPI_Bcast()
* Returns: - MPI_SUCCESS or error code
*/
int mca_coll_hierarch_bcast_intra(void *buff, int count,
struct ompi_datatype_t *datatype, int root,
int mca_coll_hierarch_bcast_intra(void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm)
{
struct mca_coll_base_comm_t *data=NULL;
@@ -47,7 +64,7 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
need something significantly better */
if ( rank == root ) {
for (i=0; i< data->hier_num_lleaders; i++) {
if ( data->hier_lleaders[i] == data->hier_my_lleader ) {
if ( data->hier_lleaders[i] == root ) {
data->hier_reqs[i] = MPI_REQUEST_NULL;
continue;
}
@@ -59,13 +76,16 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
return ret;
}
}
ret = ompi_request_wait_all ( data->hier_num_lleaders, data->hier_reqs,
MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
else if ( data->hier_am_lleader ) {
if ( data->hier_am_lleader ) {
ret = mca_pml.pml_recv ( buff, count, datatype, root,
MCA_COLL_BASE_TAG_BCAST, comm,
MPI_STATUS_IGNORE );
@@ -74,6 +94,9 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
}
}
/* once the local leaders have received the data from the root, they can
   distribute it to the processes in their local, low-level communicator.
*/
if ( MPI_COMM_NULL != llcomm ) {
ret = llcomm->c_coll.coll_bcast(buff, count, datatype,
data->hier_my_lleader, llcomm );
@@ -81,3 +104,299 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
return ret;
}
#else
int mca_coll_hierarch_bcast_intra(void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm)
{
struct mca_coll_base_comm_t *data=NULL;
struct ompi_communicator_t *llcomm=NULL;
int rank, ret;
int segsize;
rank = ompi_comm_rank ( comm );
data = comm->c_coll_selected_data;
llcomm = data->hier_llcomm;
if ( rank == root || data->hier_am_lleader ) {
/* this function sets up the topology used in the segmented
   bcast afterwards and determines the segment size. */
ret = mca_coll_hierarch_intra_bcast_setup_topo (count, datatype, root, data,
&segsize);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
/* ok, now do the actual bcast. Hopefully, this routine will come
   out of Jelena's collective module in the end. For the moment,
   I have implemented it myself.
*/
ret = mca_coll_hierarch_intra_segmented_bcast (buff, count,
datatype, root,
comm, segsize,
&(data->hier_topo));
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
/* once the local leaders have received the data from the root, they can
   distribute it to the processes in their local, low-level communicator.
*/
if ( MPI_COMM_NULL != llcomm ) {
ret = llcomm->c_coll.coll_bcast(buff, count, datatype,
data->hier_my_lleader, llcomm );
}
return ret;
}
/*
 * This is the mother of all segmented bcast algorithms of any type.
 * Due to the general structure of the topo argument, you can use this function
 * for any type of algorithm - it just depends on the settings of topo.
 *
 * The implementation leans strongly on the implementation in FT-MPI.
 */
static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
int count,
ompi_datatype_t * datatype,
int root,
ompi_communicator_t * comm,
int segsize,
struct mca_coll_hierarch_topo *topo)
{
int err=0, i, j;
int size, rank;
int segcount; /* Number of elements sent with each segment */
int num_segments; /* Number of segments */
int recvcount; /* the same as segcount, except for the last segment */
int typelng, realsegsize;
char *tmpbuf;
long rlb, ext;
ompi_request_t ** recv_request= NULL;
size = ompi_comm_size ( comm );
rank = ompi_comm_rank ( comm );
/* ------------------------------------------- */
/* special case for size == 1 and 2 */
if (size == 1) {
return OMPI_SUCCESS;
}
if (size == 2) {
if (rank == root) {
err = mca_pml.pml_send(buffer, count, datatype, (rank+1)%2,
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm );
if ( OMPI_SUCCESS != err ) {
return err;
}
} else {
err = mca_pml.pml_recv(buffer, count, datatype, root,
MCA_COLL_BASE_TAG_BCAST, comm,
MPI_STATUS_IGNORE);
if ( OMPI_SUCCESS != err) {
return err;
}
}
return OMPI_SUCCESS;
}
/* end special case for size == 1 and 2 */
tmpbuf = (char *) buffer;
/* -------------------------------------------------- */
/* Determine number of segments and number of elements
sent per operation */
err = ompi_ddt_type_size( datatype, &typelng);
if ( OMPI_SUCCESS != err) {
return ( err );
}
if ( segsize > 0 ) {
segcount = segsize/typelng;
num_segments = count/segcount;
if (0 != (count % segcount)) {
num_segments++;
}
}
else {
segcount = count;
num_segments = 1;
}
/* Determine real segment size = segcount * extent */
err = ompi_ddt_get_extent( datatype, &rlb, &ext );
if ( OMPI_SUCCESS != err) {
return ( err );
}
realsegsize = segcount*ext;
/* ----------------------------------------------------- */
/* Post Irecv if not root-node */
if (rank != root) {
/* has a parent. need to receive before sending */
recv_request = (ompi_request_t **) malloc ( sizeof(ompi_request_t *) * num_segments );
if ( NULL == recv_request ) {
    return OMPI_ERR_OUT_OF_RESOURCE;
}
for( i = 0; i < num_segments; i++) {
if ( i == (num_segments -1) ) {
recvcount = count - (segcount * i);
}
else {
recvcount = segcount;
}
err = mca_pml.pml_irecv(tmpbuf+i*realsegsize, recvcount, datatype,
topo->topo_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &recv_request[i]);
if ( OMPI_SUCCESS != err ) {
return ( err );
}
}
}
/* ---------------------------------------------- */
/* If leaf node, just finish the receive */
if (topo->topo_nextsize == 0) {
if(recv_request != NULL) {
err = ompi_request_wait_all (num_segments, recv_request, MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != err ) {
return ( err );
}
}
}
else {
/* ------------------------------------------ */
/* root or intermediate node */
for( i = 0; i < num_segments; i++) {
if (rank != root) {
/* intermediate nodes have to wait for the completion of
the corresponding receive */
err = ompi_request_wait_all(1, &recv_request[i], MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != err ) {
return ( err );
}
}
for ( j = 0; j < topo->topo_nextsize; j++) {
if ( i == ( num_segments - 1 )) {
recvcount = count - ( segcount * i);
}
else {
recvcount = segcount;
}
err = mca_pml.pml_send(tmpbuf+i*realsegsize, recvcount,
datatype, topo->topo_next[j],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm );
if( OMPI_SUCCESS != err ) {
return ( err );
}
} /* for ( j = 0; j < topo_nextsize; j++) */
} /* for ( i = 0; i < num_segments; i++) */
}
if(recv_request != NULL) {
free(recv_request);
}
return OMPI_SUCCESS;
}
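To make the segmentation arithmetic above concrete, a worked example with made-up numbers: broadcasting count = 100000 elements of a 4-byte datatype with the 32k segsize chosen below gives

    /* illustration only: count = 100000, typelng = 4, segsize = 32768 */
    segcount     = 32768 / 4;       /* 8192 elements per segment */
    num_segments = 100000 / 8192;   /* 12 full segments */
    /* 100000 % 8192 == 1696 != 0, so one extra, shorter segment */
    num_segments++;                 /* 13 segments in total */
    /* last segment: recvcount = 100000 - 12*8192 = 1696 elements */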
/*
* This routine does the magic to determine which topology (bmtree, linear, chain, etc.)
* would perform best in this scenario. At the moment, we just do bmtree.
*
* The implementation is once again strongly related to the version in FT-MPI.
*/
static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
ompi_datatype_t *datatype,
int root,
struct mca_coll_base_comm_t *data,
int *segsize)
{
/* without spending time on this issue, I set segsize to 32k for the moment. */
*segsize = 32768;
/* without spending time on that issue, I set the topology to a binomial tree */
setup_topo_bmtree ( root, data );
return OMPI_SUCCESS;
}
static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data )
{
/* This implementation is based on the closest-first bmtree algorithm
   in FT-MPI implemented by George/Jelena; it has, however, a couple of
   significant modifications:
   - we do not have a contiguous list of participating processes,
     but a list containing the ranks of the participating processes.
   - if the root is not part of this list, we add it to the list
*/
int childs = 0;
int rank, size, mask=1;
int index, remote, found;
int rootpos;
struct mca_coll_hierarch_topo *topo=&(data->hier_topo);
MCA_COLL_HIERARCH_IS_ROOT_LLEADER (root, data->hier_lleaders,
data->hier_num_lleaders, found,
rootpos);
if (found) {
size = data->hier_num_lleaders;
}
else {
size = data->hier_num_lleaders + 1;
data->hier_lleaders[data->hier_num_lleaders] = root;
}
rank = data->hier_my_lleader;
/* allocate the array of child processes, if not yet done */
if ( NULL == topo->topo_next && 0 == topo->topo_maxsize ) {
    topo->topo_next = (int *) malloc ( (data->hier_num_lleaders + 1) * sizeof(int));
    if ( NULL == topo->topo_next ) {
        return;
    }
    topo->topo_maxsize = data->hier_num_lleaders + 1;
}
index = rank - rootpos;
if( index < 0 ) index += size;
while( mask <= index ) mask <<= 1;
/* Determine the rank of my father */
if( rootpos == rank ) {
topo->topo_prev = root;
}
else {
remote = (index ^ (mask >> 1)) + rootpos;
if( remote >= size ) {
remote -= size;
}
topo->topo_prev = data->hier_lleaders[remote];
}
/* And now let's fill in my children */
while( mask < size ) {
remote = (index ^ mask);
if( remote >= size ) break;
remote += rootpos;
if( remote >= size ) remote -= size;
topo->topo_next[childs] = data->hier_lleaders[remote];
mask <<= 1;
childs++;
}
topo->topo_nextsize = childs;
return;
}
#endif
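A hand trace of setup_topo_bmtree with hypothetical values shows the "closest first" shape of the tree: with hier_lleaders = {0, 4, 8, 12} and root = 4, i.e. rootpos = 1 and size = 4,

    /* index = (position - rootpos) mod size

       lleader   position   index   topo_prev   topo_next
          4         1         0      4 (root)    {8, 12}
          8         2         1      4           {0}
         12         3         2      4           {}
          0         0         3      8           {}
    */

so the root reaches its two closest lleaders directly and only rank 0 sits at depth two.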

View file

@@ -43,19 +43,3 @@ int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
}
/*
* reduce_log_inter
*
* Function: - reduction using O(N) algorithm
* Accepts: - same as MPI_Reduce()
* Returns: - MPI_SUCCESS or error code
*/
int mca_coll_hierarch_reduce_inter(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root, struct ompi_communicator_t *comm)
{
ompi_output_verbose(10, mca_coll_base_output, "In hierarch reduce_inter");
return comm->c_coll_basic_module->coll_reduce(sbuf, rbuf, count, dtype,
op, root, comm);
}