diff --git a/ompi/mca/coll/tuned/.ompi_ignore b/ompi/mca/coll/tuned/.ompi_ignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ompi/mca/coll/tuned/.ompi_unignore b/ompi/mca/coll/tuned/.ompi_unignore new file mode 100644 index 0000000000..8ad2b972c3 --- /dev/null +++ b/ompi/mca/coll/tuned/.ompi_unignore @@ -0,0 +1 @@ +gef diff --git a/ompi/mca/coll/tuned/Makefile.am b/ompi/mca/coll/tuned/Makefile.am new file mode 100644 index 0000000000..5530c0a0de --- /dev/null +++ b/ompi/mca/coll/tuned/Makefile.am @@ -0,0 +1,49 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University. +# All rights reserved. +# Copyright (c) 2004-2005 The Trustees of the University of Tennessee. +# All rights reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Use the top-level Makefile.options + +include $(top_ompi_srcdir)/config/Makefile.options + +sources = \ + coll_tuned.h \ + coll_tuned_topo.h \ + coll_tuned_topo.c \ + coll_tuned_bcast_decision.c \ + coll_tuned_bcast.c \ + coll_tuned_component.c \ + coll_tuned_module.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if OMPI_BUILD_coll_tuned_DSO +component_noinst = +component_install = mca_coll_tuned.la +else +component_noinst = libmca_coll_tuned.la +component_install = +endif + +mcacomponentdir = $(libdir)/openmpi +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_tuned_la_SOURCES = $(sources) +mca_coll_tuned_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_tuned_la_SOURCES =$(sources) +libmca_coll_tuned_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h new file mode 100644 index 0000000000..7aec526093 --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_TUNED_EXPORT_H +#define MCA_COLL_TUNED_EXPORT_H + +#include "ompi_config.h" + +#include "mpi.h" +#include "mca/mca.h" +#include "mca/coll/coll.h" +#include "request/request.h" +#include "mca/pml/pml.h" + +/* need to include our own topo prototypes so we can malloc data on the comm correctly */ +#include "coll_tuned_topo.h" + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/* + * Globally exported variable + */ + +OMPI_COMP_EXPORT extern const mca_coll_base_component_1_0_0_t mca_coll_tuned_component; +OMPI_COMP_EXPORT extern int mca_coll_tuned_priority_param; + + +/* + * coll API functions + */ + + + /* API functions */ + + int mca_coll_tuned_init_query(bool enable_progress_threads, + bool enable_mpi_threads); + const struct mca_coll_base_module_1_0_0_t * + mca_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority, + struct mca_coll_base_comm_t **data); + + const struct mca_coll_base_module_1_0_0_t * + mca_coll_tuned_module_init(struct ompi_communicator_t *comm); + int mca_coll_tuned_module_finalize(struct ompi_communicator_t *comm); + + /* API functions of decision functions and any implementations */ + + int mca_coll_tuned_allgather_intra_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + int mca_coll_tuned_allgather_inter_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_allgatherv_intra_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void * rbuf, int *rcounts, int *disps, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + int mca_coll_tuned_allgatherv_inter_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void * rbuf, int *rcounts, int *disps, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_allreduce_intra_dec(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm); + int mca_coll_tuned_allreduce_inter_dec(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_alltoall_intra_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + int mca_coll_tuned_alltoall_inter_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_alltoallv_intra_dec(void *sbuf, int *scounts, int *sdisps, + struct ompi_datatype_t *sdtype, + void *rbuf, int *rcounts, int *rdisps, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + int mca_coll_tuned_alltoallv_inter_dec(void *sbuf, int *scounts, int *sdisps, + struct ompi_datatype_t *sdtype, + void *rbuf, int *rcounts, int *rdisps, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_alltoallw_intra_dec(void *sbuf, int *scounts, int *sdisps, + struct ompi_datatype_t **sdtypes, + void *rbuf, int *rcounts, int *rdisps, + struct ompi_datatype_t **rdtypes, + struct ompi_communicator_t *comm); + int mca_coll_tuned_alltoallw_inter_dec(void *sbuf, 
int *scounts, int *sdisps, + struct ompi_datatype_t **sdtypes, + void *rbuf, int *rcounts, int *rdisps, + struct ompi_datatype_t **rdtypes, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_barrier_intra_dec(struct ompi_communicator_t *comm); + int mca_coll_tuned_barrier_inter_dec(struct ompi_communicator_t *comm); + + + int mca_coll_tuned_bcast_intra_dec(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_bcast_intra_linear(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_bcast_intra_chain(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm, + uint32_t segsize, int32_t chains); + + int mca_coll_tuned_bcast_intra_pipeline(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm, + uint32_t segsize); + + int mca_coll_tuned_bcast_intra_bmtree(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm, + uint32_t segsize, int32_t chains); + + int mca_coll_tuned_bcast_intra_bintree(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm, + uint32_t segsize); + + + + + int mca_coll_tuned_bcast_inter_dec(void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_exscan_intra_dec(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm); + int mca_coll_tuned_exscan_inter_dec(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_gather_intra_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, void *rbuf, + int rcount, struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm); + int mca_coll_tuned_gather_inter_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, void *rbuf, + int rcount, struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm); + + int mca_coll_tuned_gatherv_intra_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, void *rbuf, + int *rcounts, int *disps, + struct ompi_datatype_t *rdtype, int root, + struct ompi_communicator_t *comm); + int mca_coll_tuned_gatherv_inter_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, void *rbuf, + int *rcounts, int *disps, + struct ompi_datatype_t *rdtype, int root, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_reduce_intra_dec(void *sbuf, void* rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm); + int mca_coll_tuned_reduce_inter_dec(void *sbuf, void* rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_reduce_scatter_intra_dec(void *sbuf, void *rbuf, + int *rcounts, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm); + int mca_coll_tuned_reduce_scatter_inter_dec(void *sbuf, void *rbuf, + int *rcounts, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_scan_intra_dec(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t 
*comm); + int mca_coll_tuned_scan_inter_dec(void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm); + + int mca_coll_tuned_scatter_intra_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, void *rbuf, + int rcount, struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm); + int mca_coll_tuned_scatter_inter_dec(void *sbuf, int scount, + struct ompi_datatype_t *sdtype, void *rbuf, + int rcount, struct ompi_datatype_t *rdtype, + int root, struct ompi_communicator_t *comm); + + int mca_coll_tuned_scatterv_intra_dec(void *sbuf, int *scounts, int *disps, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, int root, + struct ompi_communicator_t *comm); + int mca_coll_tuned_scatterv_inter_dec(void *sbuf, int *scounts, int *disps, + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, int root, + struct ompi_communicator_t *comm); + + + +/* Utility functions */ + +static inline void mca_coll_tuned_free_reqs(ompi_request_t **reqs, int count) +{ + int i; + for (i = 0; i < count; ++i) + ompi_request_free(&reqs[i]); +} + +/* decision table declaraion */ +/* currently a place holder */ +typedef struct rule_s { +} rule_t; + +/* + * Data structure for hanging data off the communicator + */ +struct mca_coll_base_comm_t { + /* standard data for requests and PML usage */ + + /* we need to keep this here for now incase we fall through to the basic functions + * that expect these fields/and memory to be avaliable (GEF something for JS?) + */ + ompi_request_t **mccb_reqs; + int mccb_num_reqs; + + + /* + * tuned topo information caching per communicator + * + * for each communicator we cache the topo information so we can reuse without regenerating + * if we change the root, [or fanout] then regenerate and recache this information + * + */ + + ompi_coll_tree_t *cached_tree; + int cached_tree_root; + int cached_tree_fanout; + + ompi_coll_bmtree_t *cached_bmtree; + int cached_bmtree_root; + + ompi_coll_chain_t *cached_chain; + int cached_chain_root; + int cached_chain_fanout; + + /* extra data required by the decision functions */ + rule_t* decision_table; +}; + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif /* MCA_COLL_TUNED_EXPORT_H */ diff --git a/ompi/mca/coll/tuned/coll_tuned_bcast.c b/ompi/mca/coll/tuned/coll_tuned_bcast.c new file mode 100644 index 0000000000..ce8771391b --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_bcast.c @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "coll_tuned.h" + +#include "mpi.h" +#include "ompi/include/constants.h" +#include "datatype/datatype.h" +#include "communicator/communicator.h" +#include "mca/coll/coll.h" +#include "mca/coll/base/coll_tags.h" +#include "mca/pml/pml.h" + +/* external prototype we need */ +extern int ompi_mpi_sendrecv( void* sendbuf, int scount, ompi_datatype_t* sdatatype, + int dest, int stag, + void* recvbuf, int rcount, ompi_datatype_t* rdata, + int source, int rtag, + struct ompi_communicator_t* comm, + ompi_status_public_t* status ); + +int +mca_coll_tuned_bcast_intra_chain ( void *buff, int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm, + uint32_t segsize, int32_t chains ) +{ + int err = 0, line, rank, size, segindex, i; + int segcount; /* Number of elements sent with each segment */ + int num_segments; /* Number of segmenets */ + int sendcount; /* the same like segcount, except for the last segment */ + int new_sendcount; /* used to mane the size for the next pipelined receive */ + int realsegsize; + char *tmpbuf = (char*)buff; + long ext; + int typelng; + MPI_Request base_req, new_req; + ompi_coll_chain_t* chain; + + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); + + printf("mca_coll_tuned_bcast_intra_chain rank %d root %d\n", rank, root); + + if( size == 1 ) { + return MPI_SUCCESS; + } + + /* + * setup the chain topology. + * if the previous chain topology is the same, then use this cached copy + * other wise recreate it. + */ + + if ((comm->c_coll_selected_data->cached_chain) && (comm->c_coll_selected_data->cached_chain_root == root) + && (comm->c_coll_selected_data->cached_chain_fanout == chains)) { + chain = comm->c_coll_selected_data->cached_chain; + } + else { + if (comm->c_coll_selected_data->cached_chain) { /* destroy previous chain if defined */ + ompi_coll_tuned_topo_destroy_chain (&comm->c_coll_selected_data->cached_chain); + } + comm->c_coll_selected_data->cached_chain = chain = ompi_coll_tuned_topo_build_chain( chains, comm, root ); + comm->c_coll_selected_data->cached_chain_root = root; + comm->c_coll_selected_data->cached_chain_fanout = chains; + } + + ompi_ddt_type_size( datatype, &typelng ); + + /* Determine number of segments and number of elements + * sent per operation */ + if( segsize == 0 ) { + /* no segmentation */ + segcount = count; + num_segments = 1; + } else { + /* segment the message */ + segcount = segsize / typelng; + num_segments = count / segcount; + if ((count % segcount)!= 0) num_segments++; + } + realsegsize = segcount*ext; + /* set the buffer pointer */ + tmpbuf = (char *)buff; + + /* root code */ + if( rank == root ) { + /* for each segment */ + sendcount = segcount; + for (segindex = 0; segindex < num_segments; segindex++) { + /* determine how many elements are being sent in this round */ + if( segindex == (num_segments - 1) ) + sendcount = count - segindex*segcount; + for( i = 0; i < chain->chain_nextsize; i++ ) { + err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype, + chain->chain_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD,comm)); + if( MPI_SUCCESS != err ) { line = __LINE__; goto error_hndl; } + } + /* update tmp buffer */ + tmpbuf += realsegsize; + } + } + + /* intermediate nodes code */ + else if (chain->chain_nextsize > 0) { + /* Create the pipeline. 
We first post the first receive, then in the loop we + * post the next receive and after that wait for the previous receive to + * complete and we disseminating the data to all children. + */ + new_sendcount = sendcount = segcount; + err = MCA_PML_CALL(recv( tmpbuf, sendcount, datatype, + chain->chain_prev, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUS_IGNORE)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + for( segindex = 1; segindex < num_segments; segindex++ ) { + /* determine how many elements to expect in this round */ + if( segindex == (num_segments - 1)) + new_sendcount = count - segindex*segcount; + /* post new irecv */ + err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, new_sendcount, + datatype, chain->chain_prev, + MCA_COLL_BASE_TAG_BCAST, comm, &new_req)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + /* wait for and forward current segment */ + err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE ); + for( i = 0; i < chain->chain_nextsize; i++ ) { + /* send data to children */ + err = MCA_PML_CALL(send( tmpbuf, sendcount, datatype, + chain->chain_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } /* end of for each child */ + /* upate the base request */ + base_req = new_req; + /* go to the next buffer (ie. the one corresponding to the next recv) */ + tmpbuf += realsegsize; + sendcount = new_sendcount; + } /* end of for segindex */ + + /* wait for the last segment and forward current segment */ + err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE ); + for( i = 0; i < chain->chain_nextsize; i++ ) { + /* send data to children */ + err = MCA_PML_CALL(send( tmpbuf, sendcount, datatype, + chain->chain_next[i], + MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } /* end of for each child */ + } + + /* leaf nodes */ + else { + sendcount = segcount; + for (segindex = 0; segindex < num_segments; segindex++) { + /* determine how many elements to expect in this round */ + if (segindex == (num_segments - 1)) + sendcount = count - segindex*segcount; + /* receive segments */ + err = MCA_PML_CALL(recv( tmpbuf, sendcount, datatype, + chain->chain_prev, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUS_IGNORE)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + /* update the initial pointer to the buffer */ + tmpbuf += realsegsize; + } + } + + return (MPI_SUCCESS); + error_hndl: + fprintf(stderr,"%s:%d: Error %d occurred\n",__FILE__,line,err); + return (err); +} + +int +mca_coll_tuned_bcast_intra_pipeline ( void *buffer, + int count, + struct ompi_datatype_t *datatype, + int root, + struct ompi_communicator_t *comm, + uint32_t segsize ) +{ + int rank; /* remove when removing print statement */ + rank = ompi_comm_rank(comm); /* remove when removing print statement */ + printf("mca_coll_tuned_bcast_intra_pipeline rank %d root %d\n", rank, root); + + return mca_coll_tuned_bcast_intra_chain ( buffer, count, datatype, root, comm, + segsize, 1 ); +} + + +int +mca_coll_tuned_bcast_intra_bintree ( void* buffer, + int count, + struct ompi_datatype_t* datatype, + int root, + struct ompi_communicator_t* comm, + uint32_t segsize ) +{ + int err=0, line; + int rank, size; + int segindex, i, lr, pair; + int segcount[2]; /* Number of elements sent with each segment */ + uint32_t counts[2]; + int num_segments[2]; /* Number of segmenets */ + int sendcount[2]; /* the same 
as segcount, except for the last segment */
+    int realsegsize[2];
+    char *tmpbuf[2];
+    int type_size;
+    long type_extent;
+    MPI_Request base_req, new_req;
+    ompi_coll_tree_t *tree;
+
+    size = ompi_comm_size(comm);
+    rank = ompi_comm_rank(comm);
+
+    printf("mca_coll_tuned_bcast_intra_bintree rank %d root %d\n", rank, root);
+
+    if (size == 1) {
+        return MPI_SUCCESS;
+    }
+
+    /*
+     * setup the tree topology.
+     * if the previous tree topology is the same, then use this cached copy,
+     * otherwise recreate it.
+     */
+
+    if ((comm->c_coll_selected_data->cached_tree) && (comm->c_coll_selected_data->cached_tree_root == root)
+        && (comm->c_coll_selected_data->cached_tree_fanout == 2)) {
+        tree = comm->c_coll_selected_data->cached_tree;
+    }
+    else {
+        if (comm->c_coll_selected_data->cached_tree) { /* destroy previous tree if defined */
+            ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_tree);
+        }
+        comm->c_coll_selected_data->cached_tree = tree = ompi_coll_tuned_topo_build_tree( 2, comm, root );
+        comm->c_coll_selected_data->cached_tree_root = root;
+        comm->c_coll_selected_data->cached_tree_fanout = 2;
+    }
+
+
+    err = ompi_ddt_type_size( datatype, &type_size );
+
+    /* Determine number of segments and number of elements per segment */
+    counts[0] = count/2;
+    if (count % 2 != 0) counts[0]++;
+    counts[1] = count - counts[0];
+    if ( segsize > 0 ) {
+        segcount[0] = segcount[1] = segsize / type_size;
+        num_segments[0] = counts[0]/segcount[0];
+        if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
+        num_segments[1] = counts[1]/segcount[1];
+        if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
+    } else {
+        segcount[0] = counts[0];
+        segcount[1] = counts[1];
+        num_segments[0] = num_segments[1] = 1;
+    }
+
+    /* if the message is too small to be split into segments */
+    if( (counts[0] == 0 || counts[1] == 0) ||
+        (segsize > counts[0] * type_size) ||
+        (segsize > counts[1] * type_size) ) {
+        /* TODO: call the linear version here once it exists; for now fall
+         * back to an unsegmented chain so we do not recurse on ourselves */
+        return (mca_coll_tuned_bcast_intra_chain( buffer, count, datatype,
+                                                  root, comm, 0, 1 ));
+    }
+
+    /* Determine real segment size */
+    realsegsize[0] = segcount[0] * type_extent;
+    realsegsize[1] = segcount[1] * type_extent;
+
+    /* set the buffer pointers */
+    tmpbuf[0] = (char *) buffer;
+    tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
+
+    /* Step 1:
+       Root splits the buffer in 2 and sends segmented message down the branches.
+       Left subtree of the tree receives first half of the buffer, while right
+       subtree receives the remaining message.
+ */ + + /* determine if I am left (0) or right (1), (root is right) */ + lr = ((rank + size - root)%size + 1)%2; + + /* root code */ + if( rank == root ) { + /* determine segment count */ + sendcount[0] = segcount[0]; + sendcount[1] = segcount[1]; + /* for each segment */ + for (segindex = 0; segindex < num_segments[0]; segindex++) { + /* for each child */ + for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) { + if (segindex >= num_segments[i]) { /* no more segments */ + continue; + } + /* determine how many elements are being sent in this round */ + if(segindex == (num_segments[i] - 1)) + sendcount[i] = counts[i] - segindex*segcount[i]; + /* send data */ + MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype, + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + /* update tmp buffer */ + tmpbuf[i] += realsegsize[i]; + } + } + } + + /* intermediate nodes code */ + else if( tree->tree_nextsize > 0 ) { + /* Intermediate nodes: + * It will receive segments only from one half od the data. + * Which one is determined by whether the node belongs to the "left" or "right" + * subtree. Topoloby building function builds binary tree such that + * odd "shifted ranks" ((rank + size - root)%size) are on the left subtree, + * and even on the right subtree. + * + * Create the pipeline. We first post the first receive, then in the loop we + * post the next receive and after that wait for the previous receive to complete + * and we disseminating the data to all children. + */ + sendcount[lr] = segcount[lr]; + MCA_PML_CALL(irecv(tmpbuf[lr], sendcount[lr], datatype, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &base_req)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + for( segindex = 1; segindex < num_segments[lr]; segindex++ ) { + /* determine how many elements to expect in this round */ + if( segindex == (num_segments[lr] - 1)) + sendcount[lr] = counts[lr] - segindex*segcount[lr]; + /* post new irecv */ + MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr], + datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, &new_req)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + /* wait for and forward current segment */ + err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE ); + for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */ + MCA_PML_CALL(send( tmpbuf[lr], segcount[lr], datatype, + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } /* end of for each child */ + + /* upate the base request */ + base_req = new_req; + /* go to the next buffer (ie. the one corresponding to the next recv) */ + tmpbuf[lr] += realsegsize[lr]; + } /* end of for segindex */ + + /* wait for the last segment and forward current segment */ + err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE ); + for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */ + MCA_PML_CALL(send(tmpbuf[lr], sendcount[lr], datatype, + tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } /* end of for each child */ + } + + /* leaf nodes */ + else { + /* Create the pipeline. 
We first post the first receive, then in the loop we + * post the next receive and after that wait for the previous receive to complete + * and we disseminating the data to all children. + */ + sendcount[lr] = segcount[lr]; + for (segindex = 0; segindex < num_segments[lr]; segindex++) { + /* determine how many elements to expect in this round */ + if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr]; + /* receive segments */ + MCA_PML_CALL(recv(tmpbuf[lr], sendcount[lr], datatype, + tree->tree_prev, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUS_IGNORE)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + /* update the initial pointer to the buffer */ + tmpbuf[lr] += realsegsize[lr]; + } + } + + /* reset the buffer pointers */ + tmpbuf[0] = (char *) buffer; + tmpbuf[1] = (char *) buffer+counts[0] * type_extent; + + /* Step 2: + Find your immediate pair (identical node in opposite subtree) and SendRecv + data buffer with them. + The tree building function ensures that + if (we are not root) + if we are in the left subtree (lr == 0) our pair is (rank+1)%size. + if we are in the right subtree (lr == 1) our pair is (rank-1)%size + If we have even number of nodes the rank (size-1) will pair up with root. + */ + if (lr == 0) { + pair = (rank+1)%size; + } else { + pair = (rank+size-1)%size; + } + if ( (size%2) != 0 && rank != root) { + + err = ompi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype, + pair, MCA_COLL_BASE_TAG_BCAST, + tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, + pair, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } else if ( (size%2) == 0 ) { + /* root sends right buffer to the last node */ + if( rank == root ) { + MCA_PML_CALL(send(tmpbuf[1], counts[1], datatype, + (root+size-1)%size, MCA_COLL_BASE_TAG_BCAST, + MCA_PML_BASE_SEND_STANDARD, comm)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + + } + /* last node receives right buffer from the root */ + else if (rank == size - 1) { + MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype, + root, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUS_IGNORE)); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + /* everyone else exchanges buffers */ + else { + err = ompi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype, + pair, MCA_COLL_BASE_TAG_BCAST, + tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype, + pair, MCA_COLL_BASE_TAG_BCAST, + comm, MPI_STATUS_IGNORE ); + if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } + } + } + return (MPI_SUCCESS); + + error_hndl: + fprintf(stderr,"[%d]%s:%d: Error %d occurred\n",rank,__FILE__,line,err); + return (err); +} + diff --git a/ompi/mca/coll/tuned/coll_tuned_bcast_decision.c b/ompi/mca/coll/tuned/coll_tuned_bcast_decision.c new file mode 100644 index 0000000000..f82a983759 --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_bcast_decision.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "coll_tuned.h" + +#include "mpi.h" +#include "include/constants.h" +#include "datatype/datatype.h" +#include "communicator/communicator.h" +#include "mca/coll/coll.h" +#include "mca/coll/base/coll_tags.h" +#include "coll_tuned.h" +#include "mca/pml/pml.h" +#include "opal/util/bit_ops.h" + + +/* + * bcast_intra_dec + * + * Function: - seletects broadcast algorithm to use + * Accepts: - same arguments as MPI_Bcast() + * Returns: - MPI_SUCCESS or error code (passed from the bcast implementation) + */ +int mca_coll_tuned_bcast_intra_dec(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm) +{ + int i; + int size; + int rank; + int err; + int contig; + int dsize; + + printf("mca_coll_tuned_bcast_intra_dec\n"); + + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); + +/* err = mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm); */ +/* err = mca_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, (8192)); */ +/* err = mca_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, (8192)); */ + err = mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, (8192)); + + return err; +} + + diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c new file mode 100644 index 0000000000..aed3e07503 --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * These symbols are in a file by themselves to provide nice linker + * semantics. Since linkers generally pull in symbols by object + * files, keeping these symbols as the only symbols in this file + * prevents utility programs such as "ompi_info" from having to import + * entire components just to query their version and parameters. 
+ */ + +#include "ompi_config.h" +#include "coll_tuned.h" + +#include "mpi.h" +#include "mca/coll/coll.h" +#include "coll_tuned.h" + +/* + * Public string showing the coll ompi_tuned component version number + */ +const char *mca_coll_tuned_component_version_string = + "Open MPI tuned collective MCA component version " OMPI_VERSION; + +/* + * Global variable + */ +int mca_coll_tuned_priority_param = -1; +int mca_coll_tuned_init_tree_fanout_param = -1; +int mca_coll_tuned_init_chain_fanout_param = -1; + +/* + * Local function + */ +static int tuned_open(void); + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +const mca_coll_base_component_1_0_0_t mca_coll_tuned_component = { + + /* First, the mca_component_t struct containing meta information + about the component itself */ + + { + /* Indicate that we are a coll v1.0.0 component (which also implies a + specific MCA version) */ + + MCA_COLL_BASE_VERSION_1_0_0, + + /* Component name and version */ + + "tuned", + OMPI_MAJOR_VERSION, + OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION, + + /* Component open and close functions */ + + tuned_open, + NULL + }, + + /* Next the MCA v1.0.0 component meta data */ + + { + /* Whether the component is checkpointable or not */ + + true + }, + + /* Initialization / querying functions */ + + mca_coll_tuned_init_query, + mca_coll_tuned_comm_query, + NULL +}; + + +static int tuned_open(void) +{ +/* mca_coll_tuned_component_t *ct = &mca_coll_tuned_component; */ + + /* Use a low priority, but allow other components to be lower */ + + mca_coll_tuned_priority_param = + mca_base_param_register_int("coll", "tuned", "priority", NULL, 30); + + mca_coll_tuned_init_tree_fanout_param = + mca_base_param_register_int("coll", "tuned", "init_tree_fanout", NULL, 2); + + mca_coll_tuned_init_chain_fanout_param = + mca_base_param_register_int("coll", "tuned", "init_chain_fanout", NULL, 4); + + printf("mca_coll_tuned_init_tree_fanout_param %d\nmca_coll_tuned_init_chain_fanout_param %d\n", mca_coll_tuned_init_tree_fanout_param, + mca_coll_tuned_init_chain_fanout_param); + +/* use the newer interface rsn */ +/* mca_coll_tuned_priority_param = mca_base_param_reg_int(&(ct->super), "priority", "Priority of the tuned coll component", */ +/* false, false, 30, NULL); */ + +/* mca_base_param_reg_int(&(ct->super), "init_tree_fanout", "Fan out used for [balanced] tree topologies in the tuned coll component", */ +/* false, false, 2, NULL); */ + +/* mca_base_param_reg_int(&(ct->super), "init_chain_fanout", */ +/* "Fan out used for chain [1 fanout followed by pipelines] topology in the tuned coll component", */ +/* false, false, 2, NULL); */ + + + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c new file mode 100644 index 0000000000..8fe1c97ca6 --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "coll_tuned.h" + +#include "mpi.h" +#include "ompi/include/constants.h" + +#include +#include +#include + +#include "coll_tuned_dynamic_rules.h" + +int static rule_count = 0; +int static cond_count = 0; + +/* makes a rule */ +/* this is hanging in space until linked to either other rules or a */ +/* collective decision function */ +rule_t* mk_rule () +{ +rule_t* ptr; + +ptr = (rule_t*) calloc (1, sizeof(rule_t)); +if (!ptr) { + fprintf(stderr,"calloc on mk_rule failed!\n"); + exit (-1); +} + +/* set values in the hanging rule */ +ptr->rule_id = rule_count++; + +return (ptr); +} + +/* adds a condition to a rule */ +int mk_and_add_condition_to_rule (rule_t* rule, param_index_t param, + condition_op_t op, int target) +{ +condition_t* ptr; +condition_t* last; + +if (!rule) { + fprintf(stderr,"rule given in add_condition_to_rule is NULL?!\n"); + return (-2); +} +if (param>=PARAMS) { + fprintf(stderr,"param given in add_condition_to_rule is %d?!\n", param); + return (-3); +} + +ptr = (condition_t*) calloc (1, sizeof(condition_t)); +if (!ptr) { + fprintf(stderr,"calloc on add_condition_to_rule failed!\n"); + return (-5); +} + +/* set values in the condition */ +ptr->cond_id = cond_count++; +ptr->param = param; +ptr->op = op; +ptr->value = target; + +/* set values in the rule */ +if (rule->nconditions) { /* if we already have conditions add to the last one */ + last = rule->last_condition; + last->next = ptr; +} +else { /* its the very first / head condition */ + rule->first_condition = ptr; +} + +/* common to both all cases */ +rule->nconditions++; +rule->last_condition = ptr; + +return (0); +} + +/* attaches a rules/collective functions TO a rule (not otherway round) */ +int set_rule_links (rule_t * rule, ifp true_fptr, int* true_extraargs, + ifp false_fptr, int* false_extraargs, + rule_t* true_rule, rule_t* false_rule) +{ + +if (!rule) { + fprintf(stderr,"rule given in set_rule_links is NULL?!\n"); + return (-2); +} + +/* check rule results.. we must have one set for true and one for false */ +if ((true_fptr)&&(true_rule)) { + fprintf(stderr,"BAD. Two links for TRUE on rule %d!\n", rule->rule_id); + return (-6); +} +if ((false_fptr)&&(false_rule)) { + fprintf(stderr,"BAD. Two links for FALSE on rule %d!\n", rule->rule_id); + return (-7); +} +if ((!true_fptr)&&(!true_rule)) { + fprintf(stderr,"BAD. NO links for TRUE on rule %d!\n", rule->rule_id); + return (-8); +} +if ((!false_fptr)&&(!false_rule)) { + fprintf(stderr,"BAD. 
NO links for FALSE on rule %d!\n", rule->rule_id);
+ return (-9);
+}
+
+/* can set the links now */
+rule->true_fptr = true_fptr;
+rule->true_extraargs = true_extraargs;
+rule->next_true_rule = (struct rules_s *) true_rule;
+rule->false_fptr = false_fptr;
+rule->false_extraargs = false_extraargs;
+rule->next_false_rule = (struct rules_s *) false_rule;
+
+return (0);
+}
+
+/* free a rule (and all attached conditions) */
+/* one day multiple rules might be able to share conditions to save memory */
+int free_rule (rule_t *rule)
+{
+condition_t* ptr;
+condition_t* next;
+int i;
+
+if (!rule) {
+ fprintf(stderr,"rule given in free_rule is NULL?!\n");
+ return (-2);
+}
+
+/* free conditions first */
+if (rule->nconditions) {
+ ptr = rule->first_condition;
+ for (i=0;i<rule->nconditions;i++) {
+ next = ptr->next;
+ free (ptr);
+ ptr = next;
+ }
+}
+
+/* all conditions freed, free the rule */
+free (rule);
+
+return (0);
+}
+
+/* evaluates a rule and returns the final function pointer that matches */
+int eval_rule (rule_t *rule, params_t *params, ifp* fptr, int** extraargs)
+{
+rule_t* currentrule;
+condition_t* currentcond;
+int true=1;
+
+if (!rule) {
+ fprintf(stderr,"rule given in eval_rule is NULL?!\n");
+ return (-2);
+}
+
+/* first special case is a fast path: a rule with no conditions is always true */
+if (!rule->nconditions) {
+#ifdef VERBOSE
+ printf("Rule %d has no conditions so forcing first available\n",
+ rule->rule_id);
+#endif /* VERBOSE */
+ *fptr = rule->true_fptr;
+ *extraargs = rule->true_extraargs;
+ return (0);
+}
+
+/* ok we have some conditions so start the evaluation */
+/* make it as lazy as possible, so a single false condition bumps us */
+
+currentrule = rule;
+
+while (currentrule) { /* rules to evaluate */
+#ifdef RULEVERBOSE
+ printf("Eval Rule %d ", currentrule->rule_id);
+#endif
+
+ /* eval each of the current rule's conditions */
+ /* we do this until we have either: */
+ /* (a) completed all conditions and are still true */
+ /* (b) found a single false */
+ /* once we have either an overall true or a false we can then check */
+ /* the next values and see if we are returning a function */
+ /* OR skipping to the next rule and hence iterate */
+
+ true = 1; /* we are ok so far */
+
+ /* first get the first condition to eval */
+ currentcond = currentrule->first_condition;
+
+ while ((currentcond)&&(true)) { /* while conditions to eval */
+#ifdef RULEVERBOSE
+ printf("Eval Cond %d ", currentcond->cond_id);
+#endif
+ switch (currentcond->op) {
+ case LT: if (params->values[currentcond->param] < currentcond->value) {true = 1;}
+ else {true = 0;}
+ break;
+ case LTEQ: if (params->values[currentcond->param] <= currentcond->value) {true = 1;}
+ else {true = 0;}
+ break;
+ case GT: if (params->values[currentcond->param] > currentcond->value) {true = 1;}
+ else {true = 0;}
+ break;
+ case GTEQ: if (params->values[currentcond->param] >= currentcond->value) {true = 1;}
+ else {true = 0;}
+ break;
+ case EQ: if (params->values[currentcond->param] == currentcond->value) {true = 1;}
+ else {true = 0;}
+ break;
+ default:
+ fprintf(stderr, "Eval: BAD operator of value %d rule %d cond %d\n",
+ currentcond->op, currentrule->rule_id,
+ currentcond->cond_id);
+ true = 0;
+ return (-1); /* ?!
what else can I do, should have caught before */ + } /* switch on condition operator */ + + /* if we are still true we go to the next condition if there is one */ + /* if there is not another then we are truely true */ + /* else if we are false, immediately fall out */ + + if (!true) { +#ifdef RULEVERBOSE + printf("Eval Cond %d returned FALSE\n", currentcond->cond_id); +#endif + break; /* if false drop out asap */ + } + if ((true)&&(currentcond->next)) { /* next condition to check */ +#ifdef RULEVERBOSE + printf("Eval Cond %d returned TRUE. Moving to next\n", currentcond->cond_id); +#endif + currentcond = currentcond->next; + } + else { /* we are true with no more conditions to check */ +#ifdef RULEVERBOSE + printf("Eval Cond %d (LAST) returned TRUE.\n", currentcond->cond_id); +#endif + break; /* so return so we can find out what to do next */ + } + + } /* while conditions to eval or a false */ + + /* condition is either fully met or lazy false */ + /* we do these IFs in the fasted/most important order */ + if ((true)&&(currentrule->true_fptr)) { +#ifdef RULEVERBOSE + printf("Eval Rule %d is TRUE returning fptr\n", currentrule->rule_id); +#endif + *fptr = currentrule->true_fptr; + *extraargs = currentrule->true_extraargs; + return (0); + } + if ((!true)&&(currentrule->false_fptr)) { +#ifdef RULEVERBOSE + printf("Eval Rule %d is FALSE returning fptr\n", currentrule->rule_id); +#endif + *fptr = currentrule->false_fptr; + *extraargs = currentrule->false_extraargs; + return (0); + } + if (true) { +#ifdef RULEVERBOSE + printf("Eval Rule %d is TRUE jumping to next rule %d\n", + currentrule->rule_id, + ((rule_t*)(currentrule->next_true_rule))->rule_id); +#endif + currentrule = (rule_t *) currentrule->next_true_rule; + } + else { /* i.e. not true / lazy eval */ +#ifdef RULEVERBOSE + printf("Eval Rule %d is FALSE jumping to next rule %d\n", + currentrule->rule_id, + ((rule_t*)(currentrule->next_false_rule))->rule_id); +#endif + currentrule = (rule_t *) currentrule->next_false_rule; + } + + if (!currentrule) { + fprintf(stderr, "eval: disaster, we have gone off into the weeds.. panic!\n"); + exit (-10); + } +} + + +} + + diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.h b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.h new file mode 100644 index 0000000000..c23a938d64 --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_rules.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_TUNED_DYNAMIC_RULES_EXPORT_H +#define MCA_COLL_TUNED_DYNAMIC_RULES_EXPORT_H + +#include "ompi_config.h" +#include "coll_tuned.h" + +#include "mpi.h" +#include "ompi/include/constants.h" + +#include +#include +#include + + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/* type is prob not needed - GEF */ +typedef enum { + RULE, + FPTR +} type_t; + + +typedef enum { + LT, + LTEQ, + GT, + GTEQ, + EQ +} condition_op_t; + + +typedef enum { + COMMSIZE, + COMMPOWER2, + DATASIZE, + DATAPOWER2, + DATACONTG, + NONZEROROOT +} param_index_t; + +#define PARAMS 6 + +typedef struct { + int values[PARAMS]; +} params_t; + + + +typedef struct condition_s { + int cond_id; + param_index_t param; + condition_op_t op; + int value; + struct condition_s *next; +} condition_t; + + +typedef int (*ifp)(); + + +typedef struct rule_s { + int rule_id; + /* type_t type; */ + /* not sure if we need different types of RULEs yet? */ + /* maybe a faster single condition rule to avoid loops and more ifs */ + /* current eval thinks that a nconditions value of 0 = true fptr */ + int nconditions; + /* we have a ptr to first and last just to speed up eval and add_to */ + condition_t *first_condition; + condition_t *last_condition; + ifp true_fptr; + int* true_extraargs; + ifp false_fptr; + int* false_extraargs; + struct rules_s* next_true_rule; + struct rules_s* next_false_rule; +} rule_t; + + + +rule_t* mk_rule (); + +int mk_and_add_condition_to_rule (rule_t* rule, param_index_t param, + condition_op_t op, int target); + +int set_rule_links (rule_t * rule, ifp true_fptr, int* true_extraargs, + ifp false_fptr, int* false_extraargs, + rule_t* true_rule, rule_t* false_rule); + +int free_rule (rule_t *rule); + +int eval_rule (rule_t* rule, params_t* params, ifp* fptr, int** extraargs); + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif + + + +#endif /* MCA_COLL_TUNED_DYNAMIC_RULES_EXPORT_H */ diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c new file mode 100644 index 0000000000..c4750d2dc4 --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "coll_tuned.h" + +#include + +#include "mpi.h" +#include "communicator/communicator.h" +#include "mca/base/mca_base_param.h" +#include "mca/coll/coll.h" +#include "mca/coll/base/base.h" +#include "coll_tuned.h" +#include "coll_tuned_topo.h" + + +/* + * Which set are we using? 
+ */ +static const mca_coll_base_module_1_0_0_t *to_use = NULL; + +/* + * Intra communicator decision functions + */ +static const mca_coll_base_module_1_0_0_t intra = { + + /* Initialization / finalization functions */ + + mca_coll_tuned_module_init, + mca_coll_tuned_module_finalize, + + /* Collective function pointers */ + +/* mca_coll_tuned_allgather_intra_dec, */ + NULL, +/* mca_coll_tuned_allgatherv_intra_dec, */ + NULL, +/* mca_coll_tuned_allreduce_intra_dec, */ + NULL, +/* mca_coll_tuned_alltoall_intra_dec, */ + NULL, +/* mca_coll_tuned_alltoallv_intra_dec, */ + NULL, +/* mca_coll_tuned_alltoallw_intra_dec, */ + NULL, +/* mca_coll_tuned_barrier_intra_dec, */ + NULL, + mca_coll_tuned_bcast_intra_dec, +/* NULL, */ +/* mca_coll_tuned_exscan_intra_dec, */ + NULL, +/* mca_coll_tuned_gather_intra_dec, */ + NULL, +/* mca_coll_tuned_gatherv_intra_dec, */ + NULL, +/* mca_coll_tuned_reduce_intra_dec, */ + NULL, +/* mca_coll_tuned_reduce_scatter_intra_dec, */ + NULL, +/* mca_coll_tuned_scan_intra_dec, */ + NULL, +/* mca_coll_tuned_scatter_intra_dec, */ + NULL, +/* mca_coll_tuned_scatterv_intra_dec */ + NULL +}; + + +/* + * collective decision functions for intercommunicators + */ +static const mca_coll_base_module_1_0_0_t inter = { + + /* Initialization / finalization functions */ + + mca_coll_tuned_module_init, + mca_coll_tuned_module_finalize, + + /* Collective function pointers */ + +/* mca_coll_tuned_allgather_inter_dec, */ + NULL, +/* mca_coll_tuned_allgatherv_inter_dec, */ + NULL, +/* mca_coll_tuned_allreduce_inter_dec, */ + NULL, +/* mca_coll_tuned_alltoall_inter_dec, */ + NULL, +/* mca_coll_tuned_alltoallv_inter_dec, */ + NULL, +/* mca_coll_tuned_alltoallw_inter_dec, */ + NULL, +/* mca_coll_tuned_barrier_inter_dec, */ + NULL, +/* mca_coll_tuned_bcast_inter_dec, */ + NULL, + /* mca_coll_tuned_exscan_inter_dec, */ + NULL, +/* mca_coll_tuned_gather_inter_dec, */ + NULL, +/* mca_coll_tuned_gatherv_inter_dec, */ + NULL, +/* mca_coll_tuned_reduce_inter_dec, */ + NULL, +/* mca_coll_tuned_reduce_scatter_inter_dec, */ + NULL, + /* mca_coll_tuned_scan_inter_dec, */ + NULL, +/* mca_coll_tuned_scatter_inter_dec, */ + NULL, +/* mca_coll_tuned_scatterv_inter_dec */ + NULL +}; + +/* Note I keep the names here as place markers until I have implemented the functions */ + + +/* + * Initial query function that is invoked during MPI_INIT, allowing + * this component to disqualify itself if it doesn't support the + * required level of thread support. + */ +int mca_coll_tuned_init_query(bool enable_progress_threads, + bool enable_mpi_threads) +{ + /* Nothing to do */ + + return OMPI_SUCCESS; +} + + +/* + * Invoked when there's a new communicator that has been created. + * Look at the communicator and decide which set of functions and + * priority we want to return. 
+ */ +const mca_coll_base_module_1_0_0_t * +mca_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority, + struct mca_coll_base_comm_t **data) +{ + + printf("Tuned query called\n"); + if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_tuned_priority_param, + priority)) { + return NULL; + } + + /* Choose whether to use [intra|inter] decision functions */ + + if (OMPI_COMM_IS_INTER(comm)) { + to_use = &inter; + } else { + to_use = &intra; + } + + return to_use; +} + + +/* + * Init module on the communicator + */ +const struct mca_coll_base_module_1_0_0_t * +mca_coll_tuned_module_init(struct ompi_communicator_t *comm) +{ + int size; + struct mca_coll_base_comm_t *data; + /* fanout parameters */ + int tree_fanout_default = 2; + int chain_fanout_default = 4; + + + printf("Tuned init module called.\n"); + + /* This routine will become more complex and might have to be broken into sections */ + + /* Order of operations: + * alloc memory for nb reqs (in case we fall through) + * add decision rules if using dynamic rules + * compact rules using communicator size info etc + * build first guess cached topologies (might depend on the rules from above) + * + * then attach all to the communicator and return base module funct ptrs + */ + + /* Allocate the data that hangs off the communicator */ + + if (OMPI_COMM_IS_INTER(comm)) { + size = ompi_comm_remote_size(comm); + } else { + size = ompi_comm_size(comm); + } + data = malloc(sizeof(struct mca_coll_base_comm_t) + + (sizeof(ompi_request_t *) * size * 2)); + + if (NULL == data) { + return NULL; + } + data->mccb_reqs = (ompi_request_t **) (data + 1); + data->mccb_num_reqs = size * 2; + + /* + * now for the cached topo functions + * guess the initial topologies to use rank 0 as root + */ + + /* get default fanouts is made available via the MCA */ + tree_fanout_default = 2; /* make it binary for now */ + chain_fanout_default = 4; + + + data->cached_tree = ompi_coll_tuned_topo_build_tree (tree_fanout_default, comm, 0); + data->cached_tree_root = 0; + data->cached_tree_fanout = tree_fanout_default; + + data->cached_bmtree = ompi_coll_tuned_topo_build_bmtree (comm, 0); + data->cached_tree_root = 0; + + /* + * chains (fanout followed by pipelines) + * are more difficuilt as the fan out really really depends on message size [sometimes].. 
+ * as size gets larger fan-out gets smaller [usually] + * + * will probably change how we cache this later, for now a midsize + * GEF + */ + data->cached_chain = ompi_coll_tuned_topo_build_chain (chain_fanout_default, comm, 0); + data->cached_chain_root = 0; + data->cached_chain_fanout = chain_fanout_default; + + /* All done */ + + comm->c_coll_selected_data = data; + + printf("Tuned looks like it is in use :)\n"); + return to_use; +} + + +/* + * Finalize module on the communicator + */ +int mca_coll_tuned_module_finalize(struct ompi_communicator_t *comm) +{ + if (NULL == comm->c_coll_selected_module) { + return OMPI_SUCCESS; + } + +#if OMPI_ENABLE_DEBUG + /* Reset the reqs to NULL/0 -- they'll be freed as part of freeing + the generel c_coll_selected_data */ + + comm->c_coll_selected_data->mccb_reqs = NULL; + comm->c_coll_selected_data->mccb_num_reqs = 0; +#endif + + /* All done */ + + free(comm->c_coll_selected_data); + comm->c_coll_selected_data = NULL; + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/tuned/coll_tuned_topo.c b/ompi/mca/coll/tuned/coll_tuned_topo.c new file mode 100644 index 0000000000..9a6f372dbd --- /dev/null +++ b/ompi/mca/coll/tuned/coll_tuned_topo.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/include/constants.h" +#include "datatype/datatype.h" +#include "communicator/communicator.h" +#include "mca/coll/coll.h" +#include "mca/coll/base/coll_tags.h" +#include "mca/pml/pml.h" +#include "coll_tuned_topo.h" + +/* + * Some static helpers. + */ +static int pown( int fanout, int num ) +{ + int j, p = 1; + if( num < 0 ) return 0; + for( j = 0; j < num; j++ ) p*= fanout; + return p; +} + +static int calculate_level( int fanout, int rank ) +{ + int level, num; + if( rank < 0 ) return -1; + for( level = 0, num = 0; num <= rank; level++ ) { + num += pown(fanout, level); + } + return level-1; +} + +static int calculate_num_nodes_up_to_level( int fanout, int level ) +{ + /* just use geometric progression formula for sum: + a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */ + return ((pown(fanout,level) - 1)/(fanout - 1)); +} + +/* + * And now the building functions. 
+ */ + +ompi_coll_tree_t* +ompi_coll_tuned_topo_build_tree( int fanout, + struct ompi_communicator_t* comm, + int root ) +{ + int rank, size; + int schild, sparent; + int level; /* location of my rank in the tree structure of size */ + int delta; /* number of nodes on my level */ + int slimit; /* total number of nodes on levels above me */ + int shiftedrank; + int i; + ompi_coll_tree_t* tree; + + printf("Building tuned topo tree: fo %d rt %d\n", fanout, root); + + /* + * Get size and rank of the process in this communicator + */ + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); + + tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); + tree->tree_root = MPI_UNDEFINED; + tree->tree_nextsize = MPI_UNDEFINED; + + /* + * Check if we calculated the tree for this root and + * fanout combination already (on this communicator) + */ + if( (root == tree->tree_root) && (fanout == tree->tree_fanout) ) { + return tree; + } + + /* + * Set root + */ + tree->tree_root = root; + + /* + * Initialize tree + */ + tree->tree_fanout = fanout; + tree->tree_root = root; + tree->tree_prev = -1; + tree->tree_nextsize = 0; + for( i = 0; i < fanout; i++ ) { + tree->tree_next[i] = -1; + } + + /* return if we have less than 2 processes */ + if( size < 2 ) { + return tree; + } + + /* + * Shift all ranks by root, so that the algorithm can be + * designed as if root would be always 0 + * shiftedrank should be used in calculating distances + * and position in tree + */ + shiftedrank = rank - root; + if( shiftedrank < 0 ) { + shiftedrank += size; + } + + /* calculate my level */ + level = calculate_level( fanout, shiftedrank ); + delta = pown( fanout, level ); + + /* find my children */ + for( i = 0; i < fanout; i++ ) { + schild = shiftedrank + delta * (i+1); + if( schild < size ) { + tree->tree_next[i] = (schild+root)%size; + tree->tree_nextsize = tree->tree_nextsize + 1; + } else { + break; + } + } + + /* find my parent */ + slimit = calculate_num_nodes_up_to_level( fanout, level ); + sparent = shiftedrank; + if( sparent < fanout ) { + sparent = 0; + } else { + while( sparent >= slimit ) { + sparent -= delta/fanout; + } + } + tree->tree_prev = (sparent+root)%size; + + return tree; +} + +int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree ) +{ + return OMPI_SUCCESS; +} + +ompi_coll_bmtree_t* +ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, + int root ) +{ + int childs = 0; + int rank; + int size; + int mask = 1; + int index; + int remote; + ompi_coll_bmtree_t *bmtree; + + printf("Building tuned topo bmtree: rt %d\n", root); + + /* + * Get size and rank of the process in this communicator + */ + size = ompi_comm_size(comm); + rank = ompi_comm_rank(comm); + + index = rank -root; + + bmtree = (ompi_coll_bmtree_t*)malloc(sizeof(ompi_coll_bmtree_t)); + bmtree->bmtree_root = MPI_UNDEFINED; + bmtree->bmtree_nextsize = MPI_UNDEFINED; + + if( bmtree->bmtree_root == root ) { + /* the bmtree was computed before */ + return bmtree; + } + + if( index < 0 ) index += size; + + while( mask <= index ) mask <<= 1; + + /* Now I can compute my father rank */ + if( root == rank ) { + bmtree->bmtree_prev = root; + } else { + remote = (index ^ (mask >> 1)) + root; + if( remote >= size ) remote -= size; + bmtree->bmtree_prev = remote; + } + /* And now let's fill my childs */ + while( mask < size ) { + remote = (index ^ mask); + if( remote >= size ) break; + remote += root; + if( remote >= size ) remote -= size; + bmtree->bmtree_next[childs] = remote; + mask <<= 1; + childs++; + } + 
+    bmtree->bmtree_nextsize = childs;
+    bmtree->bmtree_root = root;
+    return bmtree;
+}
+
+int ompi_coll_tuned_topo_destroy_bmtree( ompi_coll_bmtree_t** bmtree )
+{
+    return OMPI_SUCCESS;
+}
+
+ompi_coll_chain_t*
+ompi_coll_tuned_topo_build_chain( int fanout,
+                                  struct ompi_communicator_t* comm,
+                                  int root )
+{
+    int rank, size;
+    int srank; /* shifted rank */
+    int i,maxchainlen;
+    int mark,head,len;
+    ompi_coll_chain_t *chain;
+
+    printf("Building tuned topo chain: fo %d rt %d\n", fanout, root);
+
+    /*
+     * Get size and rank of the process in this communicator
+     */
+    size = ompi_comm_size(comm);
+    rank = ompi_comm_rank(comm);
+
+    if( fanout > MAXTREEFANOUT ) {
+        return NULL;
+    }
+
+    /*
+     * Allocate space for topology arrays if needed
+     */
+    chain = (ompi_coll_chain_t*)malloc( sizeof(ompi_coll_chain_t) );
+    chain->chain_root = MPI_UNDEFINED;
+    chain->chain_nextsize = -1;
+    chain->chain_numchain = -1;
+
+    /*
+     * Check if we calculated the topology for this root and comm
+     */
+    if( (root == chain->chain_root) &&
+        (fanout == chain->chain_numchain) ) {
+        return chain;
+    }
+    /*
+     * Set root & numchain
+     */
+    chain->chain_root = root;
+    if( (size - 1) < fanout ) {
+        chain->chain_numchain = size-1;
+        chain->chain_nextsize = size-1;
+        fanout = size-1;
+    } else {
+        chain->chain_numchain = fanout;
+        chain->chain_nextsize = fanout;
+    }
+
+    /*
+     * Shift ranks
+     */
+    srank = rank - root;
+    if (srank < 0) srank += size;
+
+    /*
+     * Special case - fanout == 1
+     */
+    if( fanout == 1 ) {
+        if( srank == 0 ) chain->chain_prev = -1;
+        else chain->chain_prev = (srank-1+root)%size;
+
+        if( (srank + 1) >= size) {
+            chain->chain_next[0] = -1;
+            chain->chain_nextsize = 0;
+        } else {
+            chain->chain_next[0] = (srank+1+root)%size;
+            chain->chain_nextsize = 1;
+        }
+        return chain;
+    }
+
+    /* Let's handle the case where there is just one node in the communicator */
+    if( size == 1 ) {
+        chain->chain_next[0] = -1;
+        chain->chain_nextsize = 0;
+        chain->chain_prev = -1;
+        chain->chain_numchain = 0;
+        return chain;
+    }
+    /*
+     * Calculate maximum chain length
+     */
+    maxchainlen = (size-1) / fanout;
+    if( (size-1) % fanout != 0 ) {
+        maxchainlen++;
+        mark = (size-1)%fanout;
+    } else {
+        mark = fanout+1;
+    }
+
+    /*
+     * Find your own place in the list of shifted ranks
+     */
+    if( srank != 0 ) {
+        int column;
+        if( srank-1 < (mark * maxchainlen) ) {
+            column = (srank-1)/maxchainlen;
+            head = 1+column*maxchainlen;
+            len = maxchainlen;
+        } else {
+            column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
+            head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
+            len = maxchainlen-1;
+        }
+
+        if( srank == head ) {
+            chain->chain_prev = 0; /*root*/
+        } else {
+            chain->chain_prev = srank-1; /* rank -1 */
+        }
+        if( srank == (head + len - 1) ) {
+            chain->chain_next[0] = -1;
+            chain->chain_nextsize = 0;
+        } else {
+            if( (srank + 1) < size ) {
+                chain->chain_next[0] = srank+1;
+                chain->chain_nextsize = 1;
+            } else {
+                chain->chain_next[0] = -1;
+                chain->chain_nextsize = 0;
+            }
+        }
+    }
+
+    /*
+     * Unshift values
+     */
+    if( rank == root ) {
+        chain->chain_prev = -1;
+        chain->chain_next[0] = (root+1)%size;
+        for( i = 1; i < fanout; i++ ) {
+            chain->chain_next[i] = chain->chain_next[i-1] + maxchainlen;
+            if( i > mark ) {
+                chain->chain_next[i]--;
+            }
+            chain->chain_next[i] %= size;
+        }
+        chain->chain_nextsize = fanout;
+    } else {
+        chain->chain_prev = (chain->chain_prev+root)%size;
+        if( chain->chain_next[0] != -1 ) {
+            chain->chain_next[0] = (chain->chain_next[0]+root)%size;
+        }
+    }
+
+    return chain;
+}
+
+int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain )
+{
+    return OMPI_SUCCESS;
+}
diff --git a/ompi/mca/coll/tuned/coll_tuned_topo.h b/ompi/mca/coll/tuned/coll_tuned_topo.h
new file mode 100644
index 0000000000..582f6a40fb
--- /dev/null
+++ b/ompi/mca/coll/tuned/coll_tuned_topo.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University.
+ *                         All rights reserved.
+ * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+ *                         All rights reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
+#define MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
+
+#include "ompi_config.h"
+
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C"
+{
+#endif
+
+#define MAXTREEFANOUT 32
+
+typedef struct ompi_coll_tree_t {
+    int32_t tree_root;
+    int32_t tree_fanout;
+    int32_t tree_prev;
+    int32_t tree_next[MAXTREEFANOUT];
+    int32_t tree_nextsize;
+} ompi_coll_tree_t;
+
+typedef struct ompi_coll_bmtree_t {
+    int32_t bmtree_root;
+    int32_t bmtree_prev;
+    int32_t bmtree_next[MAXTREEFANOUT];
+    int32_t bmtree_nextsize;
+} ompi_coll_bmtree_t;
+
+typedef struct ompi_coll_chain_t {
+    int32_t chain_root;
+    int32_t chain_prev;
+    int32_t chain_next[MAXTREEFANOUT];
+    int32_t chain_nextsize;
+    int32_t chain_numchain;
+} ompi_coll_chain_t;
+
+ompi_coll_tree_t*
+ompi_coll_tuned_topo_build_tree( int fanout,
+                                 struct ompi_communicator_t* com,
+                                 int root );
+int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree );
+
+ompi_coll_bmtree_t*
+ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
+                                   int root );
+int ompi_coll_tuned_topo_destroy_bmtree( ompi_coll_bmtree_t** bmtree );
+
+ompi_coll_chain_t*
+ompi_coll_tuned_topo_build_chain( int fanout,
+                                  struct ompi_communicator_t* com,
+                                  int root );
+int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain );
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+
+#endif /* MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED */
+
diff --git a/ompi/mca/coll/tuned/configure.params b/ompi/mca/coll/tuned/configure.params
new file mode 100644
index 0000000000..25354fd739
--- /dev/null
+++ b/ompi/mca/coll/tuned/configure.params
@@ -0,0 +1,21 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University.
+#                         All rights reserved.
+# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
+#                         All rights reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2005 The Regents of the University of California.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# Specific to this module
+
+PARAM_INIT_FILE=coll_tuned.c
+PARAM_CONFIG_FILES=Makefile
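
Editor's note: the three topology builders in coll_tuned_topo.c all work on ranks shifted so that the root looks like rank 0. The standalone sketch below is not part of the patch; it simply reimplements the k-ary tree level/parent/children arithmetic from ompi_coll_tuned_topo_build_tree() (via the same pown()/calculate_level() helpers) so it can be compiled and experimented with outside Open MPI. The fanout and size values in main() are arbitrary demo parameters.

#include <stdio.h>

/* same helpers as in coll_tuned_topo.c, reproduced for the demo */
static int pown(int fanout, int num)
{
    int j, p = 1;
    if (num < 0) return 0;
    for (j = 0; j < num; j++) p *= fanout;
    return p;
}

static int calculate_level(int fanout, int rank)
{
    int level, num;
    if (rank < 0) return -1;
    for (level = 0, num = 0; num <= rank; level++) {
        num += pown(fanout, level);
    }
    return level - 1;
}

static int calculate_num_nodes_up_to_level(int fanout, int level)
{
    /* a^0 + a^1 + ... + a^(level-1) = (a^level - 1)/(a - 1); fanout > 1 here */
    return (pown(fanout, level) - 1) / (fanout - 1);
}

int main(void)
{
    const int fanout = 2, size = 10;   /* arbitrary demo values */
    int rank, i;

    for (rank = 0; rank < size; rank++) {
        int level  = calculate_level(fanout, rank);
        int delta  = pown(fanout, level);   /* number of nodes on my level */
        int slimit = calculate_num_nodes_up_to_level(fanout, level);
        int parent = rank;

        if (rank < fanout) {
            parent = 0;   /* shifted rank 0 is the root; the code leaves it pointing at itself */
        } else {
            while (parent >= slimit) parent -= delta / fanout;
        }

        printf("shifted rank %2d: level %d, parent %2d, children:", rank, level, parent);
        for (i = 0; i < fanout; i++) {
            int child = rank + delta * (i + 1);
            if (child < size) printf(" %d", child);
        }
        printf("\n");
    }
    return 0;
}

With the values shown it reports, for example, that shifted rank 5 sits on level 2 with parent 1 and a single child 9, matching what build_tree() would store in tree_prev/tree_next before the ranks are unshifted by root.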
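A similar standalone sketch (again, not code from the patch) of the bit-mask walk that ompi_coll_tuned_topo_build_bmtree() uses to find a rank's parent and children in a binomial tree rooted at an arbitrary rank. The size and root constants are made-up example values.

#include <stdio.h>

int main(void)
{
    const int size = 10, root = 3;        /* arbitrary demo values */
    int rank;

    for (rank = 0; rank < size; rank++) {
        int index = rank - root;          /* shift so the root looks like 0 */
        int mask = 1, remote, parent;

        if (index < 0) index += size;
        while (mask <= index) mask <<= 1; /* smallest power of two greater than index */

        if (rank == root) {
            parent = root;                /* the patch stores the root itself as its parent */
        } else {
            remote = (index ^ (mask >> 1)) + root;
            if (remote >= size) remote -= size;
            parent = remote;
        }

        printf("rank %d: parent %d, children:", rank, parent);
        while (mask < size) {
            remote = index ^ mask;
            if (remote >= size) break;
            remote += root;
            if (remote >= size) remote -= size;
            printf(" %d", remote);
            mask <<= 1;
        }
        printf("\n");
    }
    return 0;
}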
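Finally, a sketch of how ompi_coll_tuned_topo_build_chain() partitions the size-1 non-root ranks into fanout chains: when (size-1) is not a multiple of fanout, the first mark chains receive maxchainlen ranks and the remaining chains one fewer; when it divides evenly, mark is the sentinel fanout+1 and every chain keeps maxchainlen ranks. The program below mirrors only that segmentation logic (no prev/next wiring or unshifting), with illustrative size and fanout values; it is not part of the patch.

#include <stdio.h>

int main(void)
{
    const int size = 10, fanout = 4;      /* arbitrary demo values */
    int maxchainlen = (size - 1) / fanout;
    int mark, srank;

    if ((size - 1) % fanout != 0) {
        maxchainlen++;
        mark = (size - 1) % fanout;
    } else {
        mark = fanout + 1;                /* sentinel: every chain has maxchainlen ranks */
    }

    for (srank = 1; srank < size; srank++) {
        int column, head, len;
        if (srank - 1 < mark * maxchainlen) {
            column = (srank - 1) / maxchainlen;
            head   = 1 + column * maxchainlen;
            len    = maxchainlen;
        } else {
            column = mark + (srank - 1 - mark * maxchainlen) / (maxchainlen - 1);
            head   = mark * maxchainlen + 1 + (column - mark) * (maxchainlen - 1);
            len    = maxchainlen - 1;
        }
        printf("shifted rank %d: chain %d, head %d, length %d\n",
               srank, column, head, len);
    }
    return 0;
}

For size 10 and fanout 4 this yields the chains {1,2,3}, {4,5}, {6,7}, {8,9}: one chain of length maxchainlen = 3 (mark = 1) and three of length 2, which is the shape the real builder then threads with chain_prev/chain_next before unshifting by root.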