diff --git a/src/mca/coll/hierarch/coll_hierarch.c b/src/mca/coll/hierarch/coll_hierarch.c
index c55df45f87..fef5c27ba7 100644
--- a/src/mca/coll/hierarch/coll_hierarch.c
+++ b/src/mca/coll/hierarch/coll_hierarch.c
@@ -40,6 +40,7 @@ static void mca_coll_hierarch_checkfor_component (struct ompi_communicator_t *comm,
                                                   char *component_name,
                                                   int *key,
                                                   int *done );
+static void mca_coll_hierarch_dump_struct ( struct mca_coll_base_comm_t *c);
 
 /*
  * Linear set of collective algorithms
@@ -129,17 +130,11 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
     mca_coll_hierarch_checkfor_component ( comm, "sm", &color, &ncount);
 
-#define KNOW_HOW_TO_CALL_ALLGATHER
-#ifdef KNOW_HOW_TO_CALL_ALLGATHER
     comm->c_coll_basic_module->coll_allreduce (&ncount, &maxncount, 1, MPI_INT,
                                                MPI_MAX, comm );
-    comm->c_coll_basic_module->coll_allgather (&color, 1, MPI_INT, colorarr, 1, MPI_INT, comm );
-#else
-    maxncount = ncount;
-#endif
     if ( 1 == maxncount ) {
         /*
          * this means, no process has a partner to which it can talk with 'sm',
@@ -197,7 +192,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
     data->hier_llcomm = llcomm;
     data->hier_num_reqs = 2 * size;
-    data->hier_reqs = (ompi_request_t **) malloc (sizeof(ompi_request_t)*2*size);
+    data->hier_reqs = (ompi_request_t **) malloc (sizeof(ompi_request_t)*size*2);
     if ( NULL == data->hier_reqs ) {
         goto exit;
     }
 
@@ -236,7 +231,9 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
     }
     data->hier_num_lleaders = c-1;
-    data->hier_lleaders = (int *) malloc ( sizeof(int) * data->hier_num_lleaders);
+    /* we allocate one more element than required to be able to add the
+       root of an operation to this list */
+    data->hier_lleaders = (int *) malloc ( sizeof(int) * (data->hier_num_lleaders + 1));
     if ( NULL == data->hier_lleaders ) {
         goto exit;
     }
@@ -260,6 +257,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
         if ( NULL != data->hier_lleaders ) {
             free ( data->hier_lleaders);
         }
+        free ( data );
     }
 
     return NULL;
@@ -283,6 +281,9 @@ int mca_coll_hierarch_module_finalize(struct ompi_communicator_t *comm)
     ompi_comm_free (&llcomm);
     free ( data->hier_reqs);
     free ( data->hier_lleaders);
+    if ( NULL != data->hier_topo.topo_next ) {
+        free (data->hier_topo.topo_next);
+    }
     free ( data );
 
     comm->c_coll_selected_data = NULL;
@@ -326,7 +327,7 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
     for ( i=0; i<size; i++ ) {
         head = &(proc->proc_ptl_next);
         listsize = mca_ptl_array_get_size(&proc->proc_ptl_next);
 #else
@@ -356,9 +357,6 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
         /* check for the required component */
         if (! strcmp (ptr->ptlm_version.mca_component_name, component_name)){
-#ifdef VERBOSE
-            printf("found component %s for rank %d \n", component_name, i );
-#endif
             counter++;
 
+static void mca_coll_hierarch_dump_struct ( struct mca_coll_base_comm_t *c)
+{
+    int i;
+
+    printf("Dump of hier-struct for comm %s cid %d\n",
+           c->hier_comm->c_name, c->hier_comm->c_contextid);
+    printf(" no of lleaders %d my_leader %d am_leader %d\n",
+           c->hier_num_lleaders, c->hier_my_lleader, c->hier_am_lleader );
+    for (i=0; i<c->hier_num_lleaders; i++ ) {
+        printf(" lleader[%d] = %d\n", i, c->hier_lleaders[i]);
+    }
+
+    return;
+}
diff --git a/src/mca/coll/hierarch/coll_hierarch.h b/src/mca/coll/hierarch/coll_hierarch.h
index 12f5954056..e19bb7e70e 100644
--- a/src/mca/coll/hierarch/coll_hierarch.h
+++ b/src/mca/coll/hierarch/coll_hierarch.h
@@ -39,7 +39,17 @@ extern int mca_coll_hierarch_verbose;
 /*
  * Data structure for attaching data to the communicator
  */
+
+struct mca_coll_hierarch_topo {
+    int  topo_root;
+    int  topo_prev;
+    int  topo_nextsize;
+    int  topo_maxsize;
+    int *topo_next;
+};
+
 struct mca_coll_base_comm_t {
+    struct ompi_communicator_t *hier_comm;         /* link back to the attached comm */
     struct ompi_communicator_t *hier_llcomm;       /* low level communicator */
     int                         hier_num_lleaders; /* number of local leaders */
     int                        *hier_lleaders;     /* list of local leaders */
@@ -47,10 +57,22 @@ extern int mca_coll_hierarch_verbose;
     int                         hier_am_lleader;   /* am I an lleader? */
     int                         hier_num_reqs;     /* num. of requests */
     ompi_request_t            **hier_reqs;         /* list of requests */
+    struct mca_coll_hierarch_topo hier_topo;       /* topology used in the coll ops */
 };
 
+#define MCA_COLL_HIERARCH_IS_ROOT_LLEADER(_root, _lls, _llsize, _found, _pos) {  \
+        int _i;                                                                  \
+        for (_found=0, _pos=_llsize, _i=0; _i<_llsize; _i++) {                   \
+            if ( _lls[_i] == _root ) {                                           \
+                _found = 1;                                                      \
+                _pos   = _i;                                                     \
+                break;                                                           \
+            }                                                                    \
+        }                                                                        \
+    }
+
 /*
  * coll API functions
  */
diff --git a/src/mca/coll/hierarch/coll_hierarch_barrier.c b/src/mca/coll/hierarch/coll_hierarch_barrier.c
index f0942002a7..6bb5bb1f89 100644
--- a/src/mca/coll/hierarch/coll_hierarch_barrier.c
+++ b/src/mca/coll/hierarch/coll_hierarch_barrier.c
@@ -37,15 +37,3 @@ int mca_coll_hierarch_barrier_intra(struct ompi_communicator_t *comm)
 }
 
 
-/*
- * barrier_inter
- *
- * Function:    - barrier using O(log(N)) algorithm
- * Accepts:     - same as MPI_Barrier()
- * Returns:     - MPI_SUCCESS or error code
- */
-int mca_coll_hierarch_barrier_inter(struct ompi_communicator_t *comm)
-{
-    ompi_output_verbose(10, mca_coll_base_output, "In hierarch barrier_inter");
-    return comm->c_coll_basic_module->coll_barrier(comm);
-}
diff --git a/src/mca/coll/hierarch/coll_hierarch_bcast.c b/src/mca/coll/hierarch/coll_hierarch_bcast.c
index 6e8bf99df1..9a1b4480b8 100644
--- a/src/mca/coll/hierarch/coll_hierarch_bcast.c
+++ b/src/mca/coll/hierarch/coll_hierarch_bcast.c
@@ -23,7 +23,22 @@
 #include "mca/coll/base/coll_tags.h"
 #include "coll_hierarch.h"
 
+static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
+                                                     int count,
+                                                     ompi_datatype_t * datatype,
+                                                     int root,
+                                                     ompi_communicator_t * comm,
+                                                     int segsize,
+                                                     struct mca_coll_hierarch_topo *topo);
+static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
+                                                     ompi_datatype_t *datatype,
+                                                     int root,
+                                                     struct mca_coll_base_comm_t *data,
+                                                     int *segsize);
+static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data );
+
+#ifdef SIMPLE_HIERARCH_BCAST
 /*
  * bcast_intra
  *
@@ -31,8 +46,10 @@
  * Accepts:     - same arguments as MPI_Bcast()
  * Returns:     - MPI_SUCCESS or error code
  */
-int mca_coll_hierarch_bcast_intra(void *buff, int count,
-                                  struct ompi_datatype_t *datatype, int root,
+int mca_coll_hierarch_bcast_intra(void *buff,
+                                  int count,
+                                  struct ompi_datatype_t *datatype,
+                                  int root,
                                   struct ompi_communicator_t *comm)
 {
     struct mca_coll_base_comm_t *data=NULL;
@@ -47,7 +64,7 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
        need something significantly better */
     if ( rank == root ) {
         for (i=0; i< data->hier_num_lleaders; i++) {
-            if ( data->hier_lleaders[i] == data->hier_my_lleader ) {
+            if ( data->hier_lleaders[i] == root ) {
                 data->hier_reqs[i] = MPI_REQUEST_NULL;
                 continue;
             }
@@ -59,13 +76,16 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
                 return ret;
             }
         }
+
         ret = ompi_request_wait_all ( data->hier_num_lleaders, data->hier_reqs,
                                       MPI_STATUSES_IGNORE);
         if ( OMPI_SUCCESS != ret ) {
             return ret;
         }
+
     }
-    else if ( data->hier_am_lleader ) {
+
+    if ( data->hier_am_lleader ) {
         ret = mca_pml.pml_recv ( buff, count, datatype, root,
                                  MCA_COLL_BASE_TAG_BCAST,
                                  comm, MPI_STATUS_IGNORE );
@@ -74,6 +94,9 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
         }
     }
 
+    /* once the local leaders got the data from the root, they can distribute
+       it to the processes in their local, low-level communicator.
+    */
     if ( MPI_COMM_NULL != llcomm ) {
         ret = llcomm->c_coll.coll_bcast(buff, count, datatype,
                                         data->hier_my_lleader, llcomm );
@@ -81,3 +104,299 @@ int mca_coll_hierarch_bcast_intra(void *buff, int count,
 
     return ret;
 }
+
+#else
+int mca_coll_hierarch_bcast_intra(void *buff,
+                                  int count,
+                                  struct ompi_datatype_t *datatype,
+                                  int root,
+                                  struct ompi_communicator_t *comm)
+{
+    struct mca_coll_base_comm_t *data=NULL;
+    struct ompi_communicator_t *llcomm=NULL;
+    int rank, ret;
+    int segsize;
+
+    rank   = ompi_comm_rank ( comm );
+    data   = comm->c_coll_selected_data;
+    llcomm = data->hier_llcomm;
+
+    if ( rank == root || data->hier_am_lleader ) {
+        /* this function sets up the topology used in the segmented
+           bcast afterwards and determines the segment size. */
+        ret = mca_coll_hierarch_intra_bcast_setup_topo (count, datatype, root, data,
+                                                        &segsize);
+        if ( OMPI_SUCCESS != ret ) {
+            return ret;
+        }
+        /* ok, do now the actual bcast. Hopefully, this routine will come
+           out of Jelena's collective module in the end. For the moment,
+           I've implemented it myself.
+        */
+        ret = mca_coll_hierarch_intra_segmented_bcast (buff, count,
+                                                       datatype, root,
+                                                       comm, segsize,
+                                                       &(data->hier_topo));
+        if ( OMPI_SUCCESS != ret ) {
+            return ret;
+        }
+    }
+
+    /* once the local leaders got the data from the root, they can distribute
+       it to the processes in their local, low-level communicator.
+    */
+    if ( MPI_COMM_NULL != llcomm ) {
+        ret = llcomm->c_coll.coll_bcast(buff, count, datatype,
+                                        data->hier_my_lleader, llcomm );
+    }
+
+    return ret;
+}
+
+/*
+ * This is the mother of all segmented bcast algorithms of any type.
+ * Due to the general structure of the topo argument, you can use this function
+ * for any type of algorithm - it just depends on the settings of topo.
+ *
+ * The implementation leans strongly on the implementation in FT-MPI.
+ */
+
+static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
+                                                     int count,
+                                                     ompi_datatype_t * datatype,
+                                                     int root,
+                                                     ompi_communicator_t * comm,
+                                                     int segsize,
+                                                     struct mca_coll_hierarch_topo *topo)
+{
+    int err=0, i, j;
+    int size, rank;
+    int segcount;      /* Number of elements sent with each segment */
+    int num_segments;  /* Number of segments */
+    int recvcount;     /* the same as segcount, except for the last segment */
+    int typelng, realsegsize;
+    char *tmpbuf;
+    long rlb, ext;
+    ompi_request_t **recv_request = NULL;
+
+    size = ompi_comm_size ( comm );
+    rank = ompi_comm_rank ( comm );
+
+    /* ------------------------------------------- */
+    /* special case for size == 1 and 2 */
+    if (size == 1) {
+        return OMPI_SUCCESS;
+    }
+    if (size == 2) {
+        if (rank == root) {
+            err = mca_pml.pml_send(buffer, count, datatype, (rank+1)%2,
+                                   MCA_COLL_BASE_TAG_BCAST,
+                                   MCA_PML_BASE_SEND_STANDARD, comm );
+            if ( OMPI_SUCCESS != err ) {
+                return err;
+            }
+        } else {
+            err = mca_pml.pml_recv(buffer, count, datatype, root,
+                                   MCA_COLL_BASE_TAG_BCAST, comm,
+                                   MPI_STATUS_IGNORE);
+            if ( OMPI_SUCCESS != err) {
+                return err;
+            }
+        }
+        return OMPI_SUCCESS;
+    }
+    /* end special case for size == 1 and 2 */
+
+
+    tmpbuf = (char *) buffer;
+    /* -------------------------------------------------- */
+    /* Determine number of segments and number of elements
+       sent per operation */
+    err = ompi_ddt_type_size( datatype, &typelng);
+    if ( OMPI_SUCCESS != err) {
+        return ( err );
+    }
+
+    if ( segsize > 0 ) {
+        segcount = segsize/typelng;
+        num_segments = count/segcount;
+        if (0 != (count % segcount)) {
+            num_segments++;
+        }
+    }
+    else {
+        segcount = count;
+        num_segments = 1;
+    }
+
+    /* Determine real segment size = segcount * extent */
+    err = ompi_ddt_get_extent( datatype, &rlb, &ext );
+    if ( OMPI_SUCCESS != err) {
+        return ( err );
+    }
+    realsegsize = segcount*ext;
+
+    /* ----------------------------------------------------- */
+    /* Post Irecv if not root-node */
+    if (rank != root) {
+        /* has a parent, need to receive before sending */
+        recv_request = (MPI_Request*)malloc ( sizeof(ompi_request_t *)*num_segments );
+
+        for( i = 0; i < num_segments; i++) {
+            if ( i == (num_segments -1) ) {
+                recvcount = count - (segcount * i);
+            }
+            else {
+                recvcount = segcount;
+            }
+            err = mca_pml.pml_irecv(tmpbuf+i*realsegsize, recvcount, datatype,
+                                    topo->topo_prev, MCA_COLL_BASE_TAG_BCAST,
+                                    comm, &recv_request[i]);
+            if ( OMPI_SUCCESS != err ) {
+                return ( err );
+            }
+        }
+    }
+
+    /* ---------------------------------------------- */
+    /* If leaf node, just finish the receives */
+    if (topo->topo_nextsize == 0) {
+        if (recv_request != NULL) {
+            err = ompi_request_wait_all (num_segments, recv_request, MPI_STATUSES_IGNORE);
+            if ( OMPI_SUCCESS != err ) {
+                return ( err );
+            }
+        }
+    }
+    else {
+        /* ------------------------------------------ */
+        /* root or intermediate node */
+        for( i = 0; i < num_segments; i++) {
+            if (rank != root) {
+                /* intermediate nodes have to wait for the completion of
+                   the corresponding receive */
+                err = ompi_request_wait_all(1, &recv_request[i], MPI_STATUSES_IGNORE);
+                if ( OMPI_SUCCESS != err ) {
+                    return ( err );
+                }
+            }
+            for ( j = 0; j < topo->topo_nextsize; j++) {
+                if ( i == ( num_segments - 1 )) {
+                    recvcount = count - ( segcount * i);
+                }
+                else {
+                    recvcount = segcount;
+                }
+
+                err = mca_pml.pml_send(tmpbuf+i*realsegsize, recvcount,
+                                       datatype, topo->topo_next[j],
+                                       MCA_COLL_BASE_TAG_BCAST,
+                                       MCA_PML_BASE_SEND_STANDARD, comm );
+                if( OMPI_SUCCESS != err ) {
+                    return ( err );
+                }
+            } /* for ( j = 0; j < topo_nextsize; j++) */
+        } /* for ( i = 0; i < num_segments; i++) */
+    }
+
+    if (recv_request != NULL) {
+        free(recv_request);
+    }
+
+    return OMPI_SUCCESS;
+}
+
+
+/*
+ * This routine does the magic to determine which topology (bmtree, linear, chain etc.)
+ * would perform best in this scenario. At the moment, we just do bmtree.
+ *
+ * The implementation is once again strongly related to the version in FT-MPI.
+ */
+static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
+                                                     ompi_datatype_t *datatype,
+                                                     int root,
+                                                     struct mca_coll_base_comm_t *data,
+                                                     int *segsize)
+{
+    /* without spending time on that issue, I set segsize for the moment to 32k. */
+    *segsize = 32768;
+
+    /* without spending time on that issue, I set the topology to a binomial tree */
+    setup_topo_bmtree ( root, data );
+
+    return OMPI_SUCCESS;
+}
+
+
+static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data )
+{
+    /* This implementation is based on the closest-first bmtree algorithm
+       in FT-MPI implemented by George/Jelena; it has however a couple of
+       significant modifications:
+       - we do not have a contiguous list of participating processes,
+         but a list containing the ranks of the participating processes.
+       - if the root is not part of this list, we add it to the list
+    */
+
+    int childs = 0;
+    int rank, size, mask=1;
+    int index, remote, found;
+    int rootpos;
+    struct mca_coll_hierarch_topo *topo=&(data->hier_topo);
+
+    MCA_COLL_HIERARCH_IS_ROOT_LLEADER (root, data->hier_lleaders,
+                                       data->hier_num_lleaders, found,
+                                       rootpos);
+
+    if (found) {
+        size = data->hier_num_lleaders;
+    }
+    else {
+        size = data->hier_num_lleaders + 1;
+        data->hier_lleaders[data->hier_num_lleaders] = root;
+    }
+    rank = data->hier_my_lleader;
+
+    /* allocate the array of child processes, if not yet done */
+    if ( NULL == topo->topo_next && 0 == topo->topo_maxsize ) {
+        topo->topo_next = (int *) malloc ((data->hier_num_lleaders+1) * sizeof(int));
+        if ( NULL == topo->topo_next ) {
+            return;
+        }
+        topo->topo_maxsize = data->hier_num_lleaders+1;
+    }
+
+    index = rank - rootpos;
+
+    if( index < 0 ) index += size;
+    while( mask <= index ) mask <<= 1;
+
+    /* Determine the rank of my parent */
+    if( rootpos == rank ) {
+        topo->topo_prev = root;
+    }
+    else {
+        remote = (index ^ (mask >> 1)) + rootpos;
+        if( remote >= size ) {
+            remote -= size;
+        }
+        topo->topo_prev = data->hier_lleaders[remote];
+    }
+
+    /* And now fill in my children */
+    while( mask < size ) {
+        remote = (index ^ mask);
+        if( remote >= size ) break;
+        remote += rootpos;
+        if( remote >= size ) remote -= size;
+        topo->topo_next[childs] = data->hier_lleaders[remote];
+        mask <<= 1;
+        childs++;
+    }
+
+    topo->topo_nextsize = childs;
+    return;
+}
+
+#endif
diff --git a/src/mca/coll/hierarch/coll_hierarch_reduce.c b/src/mca/coll/hierarch/coll_hierarch_reduce.c
index 15da5ffcad..5867578d3a 100644
--- a/src/mca/coll/hierarch/coll_hierarch_reduce.c
+++ b/src/mca/coll/hierarch/coll_hierarch_reduce.c
@@ -43,19 +43,3 @@ int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
 }
 
 
-/*
- * reduce_log_inter
- *
- * Function:    - reduction using O(N) algorithm
- * Accepts:     - same as MPI_Reduce()
- * Returns:     - MPI_SUCCESS or error code
- */
-int mca_coll_hierarch_reduce_inter(void *sbuf, void *rbuf, int count,
-                                   struct ompi_datatype_t *dtype,
-                                   struct ompi_op_t *op,
-                                   int root, struct ompi_communicator_t *comm)
-{
-    ompi_output_verbose(10, mca_coll_base_output, "In hierarch reduce_inter");
-    return comm->c_coll_basic_module->coll_reduce(sbuf, rbuf, count, dtype,
-                                                  op, root, comm);
-}
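
For reference only, not part of the patch above: a minimal standalone sketch of the segment bookkeeping that mca_coll_hierarch_intra_segmented_bcast performs, where a byte-sized segsize is turned into a per-message element count and the last segment carries the remainder. The count, element size and the main() scaffolding are made-up example values; the 32768-byte segment size is the default the patch hardcodes in mca_coll_hierarch_intra_bcast_setup_topo.

#include <stdio.h>

int main(void)
{
    int count   = 10000;   /* elements to broadcast (hypothetical example) */
    int typelng = 8;       /* bytes per element, e.g. an 8-byte double     */
    int segsize = 32768;   /* the 32k default chosen in the patch          */
    int segcount, num_segments, i, recvcount;

    /* same arithmetic as in the patch: elements per segment, then the
       number of segments, rounding up when count is not a multiple */
    segcount = segsize / typelng;            /* 4096 elements per segment  */
    num_segments = count / segcount;
    if (0 != (count % segcount)) {
        num_segments++;                      /* 10000/4096 -> 3 segments   */
    }

    for (i = 0; i < num_segments; i++) {
        /* the last segment only carries what is left over */
        recvcount = (i == num_segments - 1) ? count - segcount * i : segcount;
        printf("segment %d: %d elements\n", i, recvcount);
    }
    return 0;
}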
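Likewise for reference only: a self-contained sketch of the closest-first binomial tree computation that setup_topo_bmtree applies to the list of local-leader ranks. The leader ranks in main() and the helper name bmtree_children are hypothetical test scaffolding; the index/mask arithmetic mirrors the patch.

#include <stdio.h>

/* Compute parent and children of 'me' (an index into lleaders[0..size-1])
   for a binomial tree rooted at index 'rootpos'; returns the child count. */
static int bmtree_children(int me, int rootpos, int size,
                           const int *lleaders,
                           int *parent, int *children)
{
    int mask = 1, index, remote, nchildren = 0;

    index = me - rootpos;                 /* my position relative to the root */
    if (index < 0) index += size;
    while (mask <= index) mask <<= 1;

    /* parent: flip the highest set bit of my relative index
       (the root simply reports itself) */
    if (me == rootpos) {
        *parent = lleaders[rootpos];
    } else {
        remote = (index ^ (mask >> 1)) + rootpos;
        if (remote >= size) remote -= size;
        *parent = lleaders[remote];
    }

    /* children: set each higher bit in turn while it stays inside the tree */
    while (mask < size) {
        remote = index ^ mask;
        if (remote >= size) break;
        remote += rootpos;
        if (remote >= size) remote -= size;
        children[nchildren++] = lleaders[remote];
        mask <<= 1;
    }
    return nchildren;
}

int main(void)
{
    /* example: 5 local leaders with made-up ranks, root is the leader rank 7 */
    int lleaders[] = { 0, 4, 7, 9, 12 };
    int parent, children[5];
    int i, n, rootpos = 2;

    for (i = 0; i < 5; i++) {
        n = bmtree_children(i, rootpos, 5, lleaders, &parent, children);
        printf("leader %2d: parent %2d, %d children\n", lleaders[i], parent, n);
    }
    return 0;
}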