1
1
do not use

This commit was SVN r7103.
Этот коммит содержится в:
Graham Fagg 2005-08-31 01:43:48 +00:00
родитель d64a702a5b
Коммит 1caec16018
13 изменённых файлов: 2183 добавлений и 0 удалений

0
ompi/mca/coll/tuned/.ompi_ignore Обычный файл
Просмотреть файл

1
ompi/mca/coll/tuned/.ompi_unignore Обычный файл
Просмотреть файл

@ -0,0 +1 @@
gef

49
ompi/mca/coll/tuned/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,49 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Use the top-level Makefile.options
include $(top_ompi_srcdir)/config/Makefile.options
sources = \
coll_tuned.h \
coll_tuned_topo.h \
coll_tuned_topo.c \
coll_tuned_bcast_decision.c \
coll_tuned_bcast.c \
coll_tuned_component.c \
coll_tuned_module.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if OMPI_BUILD_coll_tuned_DSO
component_noinst =
component_install = mca_coll_tuned.la
else
component_noinst = libmca_coll_tuned.la
component_install =
endif
mcacomponentdir = $(libdir)/openmpi
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_tuned_la_SOURCES = $(sources)
mca_coll_tuned_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_tuned_la_SOURCES =$(sources)
libmca_coll_tuned_la_LDFLAGS = -module -avoid-version

307
ompi/mca/coll/tuned/coll_tuned.h Обычный файл
Просмотреть файл

@ -0,0 +1,307 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_EXPORT_H
#define MCA_COLL_TUNED_EXPORT_H
#include "ompi_config.h"
#include "mpi.h"
#include "mca/mca.h"
#include "mca/coll/coll.h"
#include "request/request.h"
#include "mca/pml/pml.h"
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
#include "coll_tuned_topo.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
* Globally exported variable
*/
OMPI_COMP_EXPORT extern const mca_coll_base_component_1_0_0_t mca_coll_tuned_component;
OMPI_COMP_EXPORT extern int mca_coll_tuned_priority_param;
/*
* coll API functions
*/
/* API functions */
int mca_coll_tuned_init_query(bool enable_progress_threads,
bool enable_mpi_threads);
const struct mca_coll_base_module_1_0_0_t *
mca_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority,
struct mca_coll_base_comm_t **data);
const struct mca_coll_base_module_1_0_0_t *
mca_coll_tuned_module_init(struct ompi_communicator_t *comm);
int mca_coll_tuned_module_finalize(struct ompi_communicator_t *comm);
/* API functions of decision functions and any implementations */
int mca_coll_tuned_allgather_intra_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_allgather_inter_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void *rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_allgatherv_intra_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void * rbuf, int *rcounts, int *disps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_allgatherv_inter_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void * rbuf, int *rcounts, int *disps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_allreduce_intra_dec(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_allreduce_inter_dec(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_alltoall_intra_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_alltoall_inter_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_alltoallv_intra_dec(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_alltoallv_inter_dec(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t *sdtype,
void *rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm);
int mca_coll_tuned_alltoallw_intra_dec(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t **sdtypes,
void *rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t **rdtypes,
struct ompi_communicator_t *comm);
int mca_coll_tuned_alltoallw_inter_dec(void *sbuf, int *scounts, int *sdisps,
struct ompi_datatype_t **sdtypes,
void *rbuf, int *rcounts, int *rdisps,
struct ompi_datatype_t **rdtypes,
struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_intra_dec(struct ompi_communicator_t *comm);
int mca_coll_tuned_barrier_inter_dec(struct ompi_communicator_t *comm);
int mca_coll_tuned_bcast_intra_dec(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_bcast_intra_linear(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_bcast_intra_chain(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
uint32_t segsize, int32_t chains);
int mca_coll_tuned_bcast_intra_pipeline(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
uint32_t segsize);
int mca_coll_tuned_bcast_intra_bmtree(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
uint32_t segsize, int32_t chains);
int mca_coll_tuned_bcast_intra_bintree(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
uint32_t segsize);
int mca_coll_tuned_bcast_inter_dec(void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_exscan_intra_dec(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_exscan_inter_dec(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_gather_intra_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, void *rbuf,
int rcount, struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm);
int mca_coll_tuned_gather_inter_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, void *rbuf,
int rcount, struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm);
int mca_coll_tuned_gatherv_intra_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, void *rbuf,
int *rcounts, int *disps,
struct ompi_datatype_t *rdtype, int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_gatherv_inter_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, void *rbuf,
int *rcounts, int *disps,
struct ompi_datatype_t *rdtype, int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_reduce_intra_dec(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_reduce_inter_dec(void *sbuf, void* rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_reduce_scatter_intra_dec(void *sbuf, void *rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_reduce_scatter_inter_dec(void *sbuf, void *rbuf,
int *rcounts,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_scan_intra_dec(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_scan_inter_dec(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
struct ompi_communicator_t *comm);
int mca_coll_tuned_scatter_intra_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, void *rbuf,
int rcount, struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm);
int mca_coll_tuned_scatter_inter_dec(void *sbuf, int scount,
struct ompi_datatype_t *sdtype, void *rbuf,
int rcount, struct ompi_datatype_t *rdtype,
int root, struct ompi_communicator_t *comm);
int mca_coll_tuned_scatterv_intra_dec(void *sbuf, int *scounts, int *disps,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype, int root,
struct ompi_communicator_t *comm);
int mca_coll_tuned_scatterv_inter_dec(void *sbuf, int *scounts, int *disps,
struct ompi_datatype_t *sdtype,
void* rbuf, int rcount,
struct ompi_datatype_t *rdtype, int root,
struct ompi_communicator_t *comm);
/* Utility functions */
static inline void mca_coll_tuned_free_reqs(ompi_request_t **reqs, int count)
{
int i;
for (i = 0; i < count; ++i)
ompi_request_free(&reqs[i]);
}
/* decision table declaraion */
/* currently a place holder */
typedef struct rule_s {
} rule_t;
/*
* Data structure for hanging data off the communicator
*/
struct mca_coll_base_comm_t {
/* standard data for requests and PML usage */
/* we need to keep this here for now incase we fall through to the basic functions
* that expect these fields/and memory to be avaliable (GEF something for JS?)
*/
ompi_request_t **mccb_reqs;
int mccb_num_reqs;
/*
* tuned topo information caching per communicator
*
* for each communicator we cache the topo information so we can reuse without regenerating
* if we change the root, [or fanout] then regenerate and recache this information
*
*/
ompi_coll_tree_t *cached_tree;
int cached_tree_root;
int cached_tree_fanout;
ompi_coll_bmtree_t *cached_bmtree;
int cached_bmtree_root;
ompi_coll_chain_t *cached_chain;
int cached_chain_root;
int cached_chain_fanout;
/* extra data required by the decision functions */
rule_t* decision_table;
};
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_COLL_TUNED_EXPORT_H */

465
ompi/mca/coll/tuned/coll_tuned_bcast.c Обычный файл
Просмотреть файл

@ -0,0 +1,465 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "coll_tuned.h"
#include "mpi.h"
#include "ompi/include/constants.h"
#include "datatype/datatype.h"
#include "communicator/communicator.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/coll_tags.h"
#include "mca/pml/pml.h"
/* external prototype we need */
extern int ompi_mpi_sendrecv( void* sendbuf, int scount, ompi_datatype_t* sdatatype,
int dest, int stag,
void* recvbuf, int rcount, ompi_datatype_t* rdata,
int source, int rtag,
struct ompi_communicator_t* comm,
ompi_status_public_t* status );
int
mca_coll_tuned_bcast_intra_chain ( void *buff, int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
uint32_t segsize, int32_t chains )
{
int err = 0, line, rank, size, segindex, i;
int segcount; /* Number of elements sent with each segment */
int num_segments; /* Number of segmenets */
int sendcount; /* the same like segcount, except for the last segment */
int new_sendcount; /* used to mane the size for the next pipelined receive */
int realsegsize;
char *tmpbuf = (char*)buff;
long ext;
int typelng;
MPI_Request base_req, new_req;
ompi_coll_chain_t* chain;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
printf("mca_coll_tuned_bcast_intra_chain rank %d root %d\n", rank, root);
if( size == 1 ) {
return MPI_SUCCESS;
}
/*
* setup the chain topology.
* if the previous chain topology is the same, then use this cached copy
* other wise recreate it.
*/
if ((comm->c_coll_selected_data->cached_chain) && (comm->c_coll_selected_data->cached_chain_root == root)
&& (comm->c_coll_selected_data->cached_chain_fanout == chains)) {
chain = comm->c_coll_selected_data->cached_chain;
}
else {
if (comm->c_coll_selected_data->cached_chain) { /* destroy previous chain if defined */
ompi_coll_tuned_topo_destroy_chain (&comm->c_coll_selected_data->cached_chain);
}
comm->c_coll_selected_data->cached_chain = chain = ompi_coll_tuned_topo_build_chain( chains, comm, root );
comm->c_coll_selected_data->cached_chain_root = root;
comm->c_coll_selected_data->cached_chain_fanout = chains;
}
ompi_ddt_type_size( datatype, &typelng );
/* Determine number of segments and number of elements
* sent per operation */
if( segsize == 0 ) {
/* no segmentation */
segcount = count;
num_segments = 1;
} else {
/* segment the message */
segcount = segsize / typelng;
num_segments = count / segcount;
if ((count % segcount)!= 0) num_segments++;
}
realsegsize = segcount*ext;
/* set the buffer pointer */
tmpbuf = (char *)buff;
/* root code */
if( rank == root ) {
/* for each segment */
sendcount = segcount;
for (segindex = 0; segindex < num_segments; segindex++) {
/* determine how many elements are being sent in this round */
if( segindex == (num_segments - 1) )
sendcount = count - segindex*segcount;
for( i = 0; i < chain->chain_nextsize; i++ ) {
err = MCA_PML_CALL(send(tmpbuf, sendcount, datatype,
chain->chain_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD,comm));
if( MPI_SUCCESS != err ) { line = __LINE__; goto error_hndl; }
}
/* update tmp buffer */
tmpbuf += realsegsize;
}
}
/* intermediate nodes code */
else if (chain->chain_nextsize > 0) {
/* Create the pipeline. We first post the first receive, then in the loop we
* post the next receive and after that wait for the previous receive to
* complete and we disseminating the data to all children.
*/
new_sendcount = sendcount = segcount;
err = MCA_PML_CALL(recv( tmpbuf, sendcount, datatype,
chain->chain_prev, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
for( segindex = 1; segindex < num_segments; segindex++ ) {
/* determine how many elements to expect in this round */
if( segindex == (num_segments - 1))
new_sendcount = count - segindex*segcount;
/* post new irecv */
err = MCA_PML_CALL(irecv( tmpbuf + realsegsize, new_sendcount,
datatype, chain->chain_prev,
MCA_COLL_BASE_TAG_BCAST, comm, &new_req));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* wait for and forward current segment */
err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
for( i = 0; i < chain->chain_nextsize; i++ ) {
/* send data to children */
err = MCA_PML_CALL(send( tmpbuf, sendcount, datatype,
chain->chain_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} /* end of for each child */
/* upate the base request */
base_req = new_req;
/* go to the next buffer (ie. the one corresponding to the next recv) */
tmpbuf += realsegsize;
sendcount = new_sendcount;
} /* end of for segindex */
/* wait for the last segment and forward current segment */
err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
for( i = 0; i < chain->chain_nextsize; i++ ) {
/* send data to children */
err = MCA_PML_CALL(send( tmpbuf, sendcount, datatype,
chain->chain_next[i],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} /* end of for each child */
}
/* leaf nodes */
else {
sendcount = segcount;
for (segindex = 0; segindex < num_segments; segindex++) {
/* determine how many elements to expect in this round */
if (segindex == (num_segments - 1))
sendcount = count - segindex*segcount;
/* receive segments */
err = MCA_PML_CALL(recv( tmpbuf, sendcount, datatype,
chain->chain_prev, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* update the initial pointer to the buffer */
tmpbuf += realsegsize;
}
}
return (MPI_SUCCESS);
error_hndl:
fprintf(stderr,"%s:%d: Error %d occurred\n",__FILE__,line,err);
return (err);
}
int
mca_coll_tuned_bcast_intra_pipeline ( void *buffer,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
uint32_t segsize )
{
int rank; /* remove when removing print statement */
rank = ompi_comm_rank(comm); /* remove when removing print statement */
printf("mca_coll_tuned_bcast_intra_pipeline rank %d root %d\n", rank, root);
return mca_coll_tuned_bcast_intra_chain ( buffer, count, datatype, root, comm,
segsize, 1 );
}
int
mca_coll_tuned_bcast_intra_bintree ( void* buffer,
int count,
struct ompi_datatype_t* datatype,
int root,
struct ompi_communicator_t* comm,
uint32_t segsize )
{
int err=0, line;
int rank, size;
int segindex, i, lr, pair;
int segcount[2]; /* Number of elements sent with each segment */
uint32_t counts[2];
int num_segments[2]; /* Number of segmenets */
int sendcount[2]; /* the same like segcount, except for the last segment */
int realsegsize[2];
char *tmpbuf[2];
int type_size;
long type_extent;
MPI_Request base_req, new_req;
ompi_coll_tree_t *tree;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
printf("mca_coll_tuned_bcast_intra_bintree rank %d root %d\n", rank, root);
if (size == 1) {
return MPI_SUCCESS;
}
/*
* setup the tree topology.
* if the previous tree topology is the same, then use this cached copy
* other wise recreate it.
*/
if ((comm->c_coll_selected_data->cached_tree) && (comm->c_coll_selected_data->cached_tree_root == root)
&& (comm->c_coll_selected_data->cached_tree_fanout == 2)) {
tree = comm->c_coll_selected_data->cached_tree;
}
else {
if (comm->c_coll_selected_data->cached_tree) { /* destroy previous chain if defined */
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_tree);
}
comm->c_coll_selected_data->cached_tree = tree = ompi_coll_tuned_topo_build_tree( 2, comm, root );
comm->c_coll_selected_data->cached_tree_root = root;
comm->c_coll_selected_data->cached_tree_fanout = 2;
}
err = ompi_ddt_type_size( datatype, &type_size );
/* Determine number of segments and number of elements per segment */
counts[0] = count/2;
if (count % 2 != 0) counts[0]++;
counts[1] = count - counts[0];
if ( segsize > 0 ) {
segcount[0] = segcount[1] = segsize / type_size;
num_segments[0] = counts[0]/segcount[0];
if ((counts[0] % segcount[0]) != 0) num_segments[0]++;
num_segments[1] = counts[1]/segcount[1];
if ((counts[1] % segcount[1]) != 0) num_segments[1]++;
} else {
segcount[0] = counts[0];
segcount[1] = counts[1];
num_segments[0] = num_segments[1] = 1;
}
/* if the message is too small to be split into segments */
if( (counts[0] == 0 || counts[1] == 0) ||
(segsize > counts[0] * type_size) ||
(segsize > counts[1] * type_size) ) {
/* call linear version here ! */
return (mca_coll_tuned_bcast_intra_bintree( buffer, count, datatype,
root, comm, segsize ));
}
/* Determine real segment size */
realsegsize[0] = segcount[0] * type_extent;
realsegsize[1] = segcount[1] * type_extent;
/* set the buffer pointers */
tmpbuf[0] = (char *) buffer;
tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
/* Step 1:
Root splits the buffer in 2 and sends segmented message down the branches.
Left subtree of the tree receives first half od the buffer, while right
subtree receives the remaining message.
*/
/* determine if I am left (0) or right (1), (root is right) */
lr = ((rank + size - root)%size + 1)%2;
/* root code */
if( rank == root ) {
/* determine segment count */
sendcount[0] = segcount[0];
sendcount[1] = segcount[1];
/* for each segment */
for (segindex = 0; segindex < num_segments[0]; segindex++) {
/* for each child */
for( i = 0; i < tree->tree_nextsize && i < 2; i++ ) {
if (segindex >= num_segments[i]) { /* no more segments */
continue;
}
/* determine how many elements are being sent in this round */
if(segindex == (num_segments[i] - 1))
sendcount[i] = counts[i] - segindex*segcount[i];
/* send data */
MCA_PML_CALL(send(tmpbuf[i], sendcount[i], datatype,
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* update tmp buffer */
tmpbuf[i] += realsegsize[i];
}
}
}
/* intermediate nodes code */
else if( tree->tree_nextsize > 0 ) {
/* Intermediate nodes:
* It will receive segments only from one half od the data.
* Which one is determined by whether the node belongs to the "left" or "right"
* subtree. Topoloby building function builds binary tree such that
* odd "shifted ranks" ((rank + size - root)%size) are on the left subtree,
* and even on the right subtree.
*
* Create the pipeline. We first post the first receive, then in the loop we
* post the next receive and after that wait for the previous receive to complete
* and we disseminating the data to all children.
*/
sendcount[lr] = segcount[lr];
MCA_PML_CALL(irecv(tmpbuf[lr], sendcount[lr], datatype,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &base_req));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
for( segindex = 1; segindex < num_segments[lr]; segindex++ ) {
/* determine how many elements to expect in this round */
if( segindex == (num_segments[lr] - 1))
sendcount[lr] = counts[lr] - segindex*segcount[lr];
/* post new irecv */
MCA_PML_CALL(irecv( tmpbuf[lr] + realsegsize[lr], sendcount[lr],
datatype, tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &new_req));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* wait for and forward current segment */
err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children (segcount[lr]) */
MCA_PML_CALL(send( tmpbuf[lr], segcount[lr], datatype,
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} /* end of for each child */
/* upate the base request */
base_req = new_req;
/* go to the next buffer (ie. the one corresponding to the next recv) */
tmpbuf[lr] += realsegsize[lr];
} /* end of for segindex */
/* wait for the last segment and forward current segment */
err = ompi_request_wait_all( 1, &base_req, MPI_STATUSES_IGNORE );
for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
MCA_PML_CALL(send(tmpbuf[lr], sendcount[lr], datatype,
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} /* end of for each child */
}
/* leaf nodes */
else {
/* Create the pipeline. We first post the first receive, then in the loop we
* post the next receive and after that wait for the previous receive to complete
* and we disseminating the data to all children.
*/
sendcount[lr] = segcount[lr];
for (segindex = 0; segindex < num_segments[lr]; segindex++) {
/* determine how many elements to expect in this round */
if (segindex == (num_segments[lr] - 1)) sendcount[lr] = counts[lr] - segindex*segcount[lr];
/* receive segments */
MCA_PML_CALL(recv(tmpbuf[lr], sendcount[lr], datatype,
tree->tree_prev, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
/* update the initial pointer to the buffer */
tmpbuf[lr] += realsegsize[lr];
}
}
/* reset the buffer pointers */
tmpbuf[0] = (char *) buffer;
tmpbuf[1] = (char *) buffer+counts[0] * type_extent;
/* Step 2:
Find your immediate pair (identical node in opposite subtree) and SendRecv
data buffer with them.
The tree building function ensures that
if (we are not root)
if we are in the left subtree (lr == 0) our pair is (rank+1)%size.
if we are in the right subtree (lr == 1) our pair is (rank-1)%size
If we have even number of nodes the rank (size-1) will pair up with root.
*/
if (lr == 0) {
pair = (rank+1)%size;
} else {
pair = (rank+size-1)%size;
}
if ( (size%2) != 0 && rank != root) {
err = ompi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
} else if ( (size%2) == 0 ) {
/* root sends right buffer to the last node */
if( rank == root ) {
MCA_PML_CALL(send(tmpbuf[1], counts[1], datatype,
(root+size-1)%size, MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
/* last node receives right buffer from the root */
else if (rank == size - 1) {
MCA_PML_CALL(recv(tmpbuf[1], counts[1], datatype,
root, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE));
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
/* everyone else exchanges buffers */
else {
err = ompi_mpi_sendrecv( tmpbuf[lr], counts[lr], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
pair, MCA_COLL_BASE_TAG_BCAST,
comm, MPI_STATUS_IGNORE );
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
}
}
return (MPI_SUCCESS);
error_hndl:
fprintf(stderr,"[%d]%s:%d: Error %d occurred\n",rank,__FILE__,line,err);
return (err);
}

Просмотреть файл

@ -0,0 +1,62 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "coll_tuned.h"
#include "mpi.h"
#include "include/constants.h"
#include "datatype/datatype.h"
#include "communicator/communicator.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/coll_tags.h"
#include "coll_tuned.h"
#include "mca/pml/pml.h"
#include "opal/util/bit_ops.h"
/*
* bcast_intra_dec
*
* Function: - seletects broadcast algorithm to use
* Accepts: - same arguments as MPI_Bcast()
* Returns: - MPI_SUCCESS or error code (passed from the bcast implementation)
*/
int mca_coll_tuned_bcast_intra_dec(void *buff, int count,
struct ompi_datatype_t *datatype, int root,
struct ompi_communicator_t *comm)
{
int i;
int size;
int rank;
int err;
int contig;
int dsize;
printf("mca_coll_tuned_bcast_intra_dec\n");
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
/* err = mca_coll_tuned_bcast_intra_linear (buff, count, datatype, root, comm); */
/* err = mca_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, (8192)); */
/* err = mca_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, (8192)); */
err = mca_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, (8192));
return err;
}

125
ompi/mca/coll/tuned/coll_tuned_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,125 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "ompi_config.h"
#include "coll_tuned.h"
#include "mpi.h"
#include "mca/coll/coll.h"
#include "coll_tuned.h"
/*
* Public string showing the coll ompi_tuned component version number
*/
const char *mca_coll_tuned_component_version_string =
"Open MPI tuned collective MCA component version " OMPI_VERSION;
/*
* Global variable
*/
int mca_coll_tuned_priority_param = -1;
int mca_coll_tuned_init_tree_fanout_param = -1;
int mca_coll_tuned_init_chain_fanout_param = -1;
/*
* Local function
*/
static int tuned_open(void);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
const mca_coll_base_component_1_0_0_t mca_coll_tuned_component = {
/* First, the mca_component_t struct containing meta information
about the component itself */
{
/* Indicate that we are a coll v1.0.0 component (which also implies a
specific MCA version) */
MCA_COLL_BASE_VERSION_1_0_0,
/* Component name and version */
"tuned",
OMPI_MAJOR_VERSION,
OMPI_MINOR_VERSION,
OMPI_RELEASE_VERSION,
/* Component open and close functions */
tuned_open,
NULL
},
/* Next the MCA v1.0.0 component meta data */
{
/* Whether the component is checkpointable or not */
true
},
/* Initialization / querying functions */
mca_coll_tuned_init_query,
mca_coll_tuned_comm_query,
NULL
};
static int tuned_open(void)
{
/* mca_coll_tuned_component_t *ct = &mca_coll_tuned_component; */
/* Use a low priority, but allow other components to be lower */
mca_coll_tuned_priority_param =
mca_base_param_register_int("coll", "tuned", "priority", NULL, 30);
mca_coll_tuned_init_tree_fanout_param =
mca_base_param_register_int("coll", "tuned", "init_tree_fanout", NULL, 2);
mca_coll_tuned_init_chain_fanout_param =
mca_base_param_register_int("coll", "tuned", "init_chain_fanout", NULL, 4);
printf("mca_coll_tuned_init_tree_fanout_param %d\nmca_coll_tuned_init_chain_fanout_param %d\n", mca_coll_tuned_init_tree_fanout_param,
mca_coll_tuned_init_chain_fanout_param);
/* use the newer interface rsn */
/* mca_coll_tuned_priority_param = mca_base_param_reg_int(&(ct->super), "priority", "Priority of the tuned coll component", */
/* false, false, 30, NULL); */
/* mca_base_param_reg_int(&(ct->super), "init_tree_fanout", "Fan out used for [balanced] tree topologies in the tuned coll component", */
/* false, false, 2, NULL); */
/* mca_base_param_reg_int(&(ct->super), "init_chain_fanout", */
/* "Fan out used for chain [1 fanout followed by pipelines] topology in the tuned coll component", */
/* false, false, 2, NULL); */
return OMPI_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,306 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "coll_tuned.h"
#include "mpi.h"
#include "ompi/include/constants.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "coll_tuned_dynamic_rules.h"
int static rule_count = 0;
int static cond_count = 0;
/* makes a rule */
/* this is hanging in space until linked to either other rules or a */
/* collective decision function */
rule_t* mk_rule ()
{
rule_t* ptr;
ptr = (rule_t*) calloc (1, sizeof(rule_t));
if (!ptr) {
fprintf(stderr,"calloc on mk_rule failed!\n");
exit (-1);
}
/* set values in the hanging rule */
ptr->rule_id = rule_count++;
return (ptr);
}
/* adds a condition to a rule */
int mk_and_add_condition_to_rule (rule_t* rule, param_index_t param,
condition_op_t op, int target)
{
condition_t* ptr;
condition_t* last;
if (!rule) {
fprintf(stderr,"rule given in add_condition_to_rule is NULL?!\n");
return (-2);
}
if (param>=PARAMS) {
fprintf(stderr,"param given in add_condition_to_rule is %d?!\n", param);
return (-3);
}
ptr = (condition_t*) calloc (1, sizeof(condition_t));
if (!ptr) {
fprintf(stderr,"calloc on add_condition_to_rule failed!\n");
return (-5);
}
/* set values in the condition */
ptr->cond_id = cond_count++;
ptr->param = param;
ptr->op = op;
ptr->value = target;
/* set values in the rule */
if (rule->nconditions) { /* if we already have conditions add to the last one */
last = rule->last_condition;
last->next = ptr;
}
else { /* its the very first / head condition */
rule->first_condition = ptr;
}
/* common to both all cases */
rule->nconditions++;
rule->last_condition = ptr;
return (0);
}
/* attaches a rules/collective functions TO a rule (not otherway round) */
int set_rule_links (rule_t * rule, ifp true_fptr, int* true_extraargs,
ifp false_fptr, int* false_extraargs,
rule_t* true_rule, rule_t* false_rule)
{
if (!rule) {
fprintf(stderr,"rule given in set_rule_links is NULL?!\n");
return (-2);
}
/* check rule results.. we must have one set for true and one for false */
if ((true_fptr)&&(true_rule)) {
fprintf(stderr,"BAD. Two links for TRUE on rule %d!\n", rule->rule_id);
return (-6);
}
if ((false_fptr)&&(false_rule)) {
fprintf(stderr,"BAD. Two links for FALSE on rule %d!\n", rule->rule_id);
return (-7);
}
if ((!true_fptr)&&(!true_rule)) {
fprintf(stderr,"BAD. NO links for TRUE on rule %d!\n", rule->rule_id);
return (-8);
}
if ((!false_fptr)&&(!false_rule)) {
fprintf(stderr,"BAD. NO links for FALSE on rule %d!\n", rule->rule_id);
return (-9);
}
/* can set the links now */
rule->true_fptr = true_fptr;
rule->true_extraargs = true_extraargs;
rule->next_true_rule = (struct rules_s *) true_rule;
rule->false_fptr = false_fptr;
rule->false_extraargs = false_extraargs;
rule->next_false_rule = (struct rules_s *) false_rule;
return (0);
}
/* free rule (and all attached conditions */
/* oneday multiple rules might be able to share conditions to save memory */
int free_rule (rule_t *rule)
{
condition_t* ptr;
condition_t* next;
int i;
if (!rule) {
fprintf(stderr,"rule given in free_rule is NULL?!\n");
return (-2);
}
/* free conditions first */
if (rule->nconditions) {
ptr = rule->first_condition;
for (i=0;i<rule->nconditions;i++) {
next = ptr->next;
free (ptr);
ptr = next;
}
}
/* all conditions freed, free the rule */
free (rule);
return (0);
}
/* evaluates a rule and returns the final function pointer that matches */
int eval_rule (rule_t *rule, params_t *params, ifp* fptr, int** extraargs)
{
rule_t* currentrule;
condition_t* currentcond;
int true=1;
if (!rule) {
fprintf(stderr,"rule given in eval_rule is NULL?!\n");
return (-2);
}
/* first special case is a very fast path... sorta not really grr */
if (!rule->nconditions) {
#ifdef VERBOSE
printf("Rule %d has no conditions so forcing first available\n",
rule->rule_id);
#endif /* VERBOSE */
*fptr = rule->true_fptr;
return (0);
}
/* ok we have some conditions so start the evaluation */
/* make it as lazy as possible, so a single false condition bumps us */
currentrule = rule;
while (currentrule) { /* rules to evaluate */
#ifdef RULEVERBOSE
printf("Eval Rule %d ", currentrule->rule_id);
#endif
/* eval each of the current rules conditions */
/* we do this until we have either: */
/* (a) completed all conditions and are still true */
/* (b) found a single false */
/* once we have either a true true or false we can then check */
/* the next values and see if we are returning a function */
/* OR skipping to the next rule and hense iterate */
true = 1; /* we are ok so far */
/* first get the first condition to eval */
currentcond = currentrule->first_condition;
while ((currentcond)&&(true)) { /* while conditions to eval */
#ifdef RULEVERBOSE
printf("Eval Cond %d ", currentcond->cond_id);
#endif
switch (currentcond->op) {
case LT: if (params->values[currentcond->param] < currentcond->value) {true = 1;}
else {true = 0;}
break;
case LTEQ: if (params->values[currentcond->param] <= currentcond->value) {true = 1;}
else {true = 0;}
break;
case GT: if (params->values[currentcond->param] > currentcond->value) {true = 1;}
else {true = 0;}
break;
case GTEQ: if (params->values[currentcond->param] >= currentcond->value) {true = 1;}
else {true = 0;}
break;
case EQ: if (params->values[currentcond->param] == currentcond->value) {true = 1;}
else {true = 0;}
break;
default:
fprintf(stderr, "Eval: BAD operator of value %d rule %d cond %d\n",
currentcond->op, currentrule->rule_id,
currentcond->cond_id);
true = 0;
return (-1); /* ?! what else can I do, should have caught before */
} /* switch on condition operator */
/* if we are still true we go to the next condition if there is one */
/* if there is not another then we are truely true */
/* else if we are false, immediately fall out */
if (!true) {
#ifdef RULEVERBOSE
printf("Eval Cond %d returned FALSE\n", currentcond->cond_id);
#endif
break; /* if false drop out asap */
}
if ((true)&&(currentcond->next)) { /* next condition to check */
#ifdef RULEVERBOSE
printf("Eval Cond %d returned TRUE. Moving to next\n", currentcond->cond_id);
#endif
currentcond = currentcond->next;
}
else { /* we are true with no more conditions to check */
#ifdef RULEVERBOSE
printf("Eval Cond %d (LAST) returned TRUE.\n", currentcond->cond_id);
#endif
break; /* so return so we can find out what to do next */
}
} /* while conditions to eval or a false */
/* condition is either fully met or lazy false */
/* we do these IFs in the fasted/most important order */
if ((true)&&(currentrule->true_fptr)) {
#ifdef RULEVERBOSE
printf("Eval Rule %d is TRUE returning fptr\n", currentrule->rule_id);
#endif
*fptr = currentrule->true_fptr;
*extraargs = currentrule->true_extraargs;
return (0);
}
if ((!true)&&(currentrule->false_fptr)) {
#ifdef RULEVERBOSE
printf("Eval Rule %d is FALSE returning fptr\n", currentrule->rule_id);
#endif
*fptr = currentrule->false_fptr;
*extraargs = currentrule->false_extraargs;
return (0);
}
if (true) {
#ifdef RULEVERBOSE
printf("Eval Rule %d is TRUE jumping to next rule %d\n",
currentrule->rule_id,
((rule_t*)(currentrule->next_true_rule))->rule_id);
#endif
currentrule = (rule_t *) currentrule->next_true_rule;
}
else { /* i.e. not true / lazy eval */
#ifdef RULEVERBOSE
printf("Eval Rule %d is FALSE jumping to next rule %d\n",
currentrule->rule_id,
((rule_t*)(currentrule->next_false_rule))->rule_id);
#endif
currentrule = (rule_t *) currentrule->next_false_rule;
}
if (!currentrule) {
fprintf(stderr, "eval: disaster, we have gone off into the weeds.. panic!\n");
exit (-10);
}
}
}

Просмотреть файл

@ -0,0 +1,119 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_DYNAMIC_RULES_EXPORT_H
#define MCA_COLL_TUNED_DYNAMIC_RULES_EXPORT_H
#include "ompi_config.h"
#include "coll_tuned.h"
#include "mpi.h"
#include "ompi/include/constants.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* type is prob not needed - GEF */
typedef enum {
RULE,
FPTR
} type_t;
typedef enum {
LT,
LTEQ,
GT,
GTEQ,
EQ
} condition_op_t;
typedef enum {
COMMSIZE,
COMMPOWER2,
DATASIZE,
DATAPOWER2,
DATACONTG,
NONZEROROOT
} param_index_t;
#define PARAMS 6
typedef struct {
int values[PARAMS];
} params_t;
typedef struct condition_s {
int cond_id;
param_index_t param;
condition_op_t op;
int value;
struct condition_s *next;
} condition_t;
typedef int (*ifp)();
typedef struct rule_s {
int rule_id;
/* type_t type; */
/* not sure if we need different types of RULEs yet? */
/* maybe a faster single condition rule to avoid loops and more ifs */
/* current eval thinks that a nconditions value of 0 = true fptr */
int nconditions;
/* we have a ptr to first and last just to speed up eval and add_to */
condition_t *first_condition;
condition_t *last_condition;
ifp true_fptr;
int* true_extraargs;
ifp false_fptr;
int* false_extraargs;
struct rules_s* next_true_rule;
struct rules_s* next_false_rule;
} rule_t;
rule_t* mk_rule ();
int mk_and_add_condition_to_rule (rule_t* rule, param_index_t param,
condition_op_t op, int target);
int set_rule_links (rule_t * rule, ifp true_fptr, int* true_extraargs,
ifp false_fptr, int* false_extraargs,
rule_t* true_rule, rule_t* false_rule);
int free_rule (rule_t *rule);
int eval_rule (rule_t* rule, params_t* params, ifp* fptr, int** extraargs);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_COLL_TUNED_DYNAMIC_RULES_EXPORT_H */

276
ompi/mca/coll/tuned/coll_tuned_module.c Обычный файл
Просмотреть файл

@ -0,0 +1,276 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "coll_tuned.h"
#include <stdio.h>
#include "mpi.h"
#include "communicator/communicator.h"
#include "mca/base/mca_base_param.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "coll_tuned.h"
#include "coll_tuned_topo.h"
/*
* Which set are we using?
*/
static const mca_coll_base_module_1_0_0_t *to_use = NULL;
/*
* Intra communicator decision functions
*/
static const mca_coll_base_module_1_0_0_t intra = {
/* Initialization / finalization functions */
mca_coll_tuned_module_init,
mca_coll_tuned_module_finalize,
/* Collective function pointers */
/* mca_coll_tuned_allgather_intra_dec, */
NULL,
/* mca_coll_tuned_allgatherv_intra_dec, */
NULL,
/* mca_coll_tuned_allreduce_intra_dec, */
NULL,
/* mca_coll_tuned_alltoall_intra_dec, */
NULL,
/* mca_coll_tuned_alltoallv_intra_dec, */
NULL,
/* mca_coll_tuned_alltoallw_intra_dec, */
NULL,
/* mca_coll_tuned_barrier_intra_dec, */
NULL,
mca_coll_tuned_bcast_intra_dec,
/* NULL, */
/* mca_coll_tuned_exscan_intra_dec, */
NULL,
/* mca_coll_tuned_gather_intra_dec, */
NULL,
/* mca_coll_tuned_gatherv_intra_dec, */
NULL,
/* mca_coll_tuned_reduce_intra_dec, */
NULL,
/* mca_coll_tuned_reduce_scatter_intra_dec, */
NULL,
/* mca_coll_tuned_scan_intra_dec, */
NULL,
/* mca_coll_tuned_scatter_intra_dec, */
NULL,
/* mca_coll_tuned_scatterv_intra_dec */
NULL
};
/*
* collective decision functions for intercommunicators
*/
static const mca_coll_base_module_1_0_0_t inter = {
/* Initialization / finalization functions */
mca_coll_tuned_module_init,
mca_coll_tuned_module_finalize,
/* Collective function pointers */
/* mca_coll_tuned_allgather_inter_dec, */
NULL,
/* mca_coll_tuned_allgatherv_inter_dec, */
NULL,
/* mca_coll_tuned_allreduce_inter_dec, */
NULL,
/* mca_coll_tuned_alltoall_inter_dec, */
NULL,
/* mca_coll_tuned_alltoallv_inter_dec, */
NULL,
/* mca_coll_tuned_alltoallw_inter_dec, */
NULL,
/* mca_coll_tuned_barrier_inter_dec, */
NULL,
/* mca_coll_tuned_bcast_inter_dec, */
NULL,
/* mca_coll_tuned_exscan_inter_dec, */
NULL,
/* mca_coll_tuned_gather_inter_dec, */
NULL,
/* mca_coll_tuned_gatherv_inter_dec, */
NULL,
/* mca_coll_tuned_reduce_inter_dec, */
NULL,
/* mca_coll_tuned_reduce_scatter_inter_dec, */
NULL,
/* mca_coll_tuned_scan_inter_dec, */
NULL,
/* mca_coll_tuned_scatter_inter_dec, */
NULL,
/* mca_coll_tuned_scatterv_inter_dec */
NULL
};
/* Note I keep the names here as place markers until I have implemented the functions */
/*
* Initial query function that is invoked during MPI_INIT, allowing
* this component to disqualify itself if it doesn't support the
* required level of thread support.
*/
int mca_coll_tuned_init_query(bool enable_progress_threads,
bool enable_mpi_threads)
{
/* Nothing to do */
return OMPI_SUCCESS;
}
/*
* Invoked when there's a new communicator that has been created.
* Look at the communicator and decide which set of functions and
* priority we want to return.
*/
const mca_coll_base_module_1_0_0_t *
mca_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority,
struct mca_coll_base_comm_t **data)
{
printf("Tuned query called\n");
if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_tuned_priority_param,
priority)) {
return NULL;
}
/* Choose whether to use [intra|inter] decision functions */
if (OMPI_COMM_IS_INTER(comm)) {
to_use = &inter;
} else {
to_use = &intra;
}
return to_use;
}
/*
* Init module on the communicator
*/
const struct mca_coll_base_module_1_0_0_t *
mca_coll_tuned_module_init(struct ompi_communicator_t *comm)
{
int size;
struct mca_coll_base_comm_t *data;
/* fanout parameters */
int tree_fanout_default = 2;
int chain_fanout_default = 4;
printf("Tuned init module called.\n");
/* This routine will become more complex and might have to be broken into sections */
/* Order of operations:
* alloc memory for nb reqs (in case we fall through)
* add decision rules if using dynamic rules
* compact rules using communicator size info etc
* build first guess cached topologies (might depend on the rules from above)
*
* then attach all to the communicator and return base module funct ptrs
*/
/* Allocate the data that hangs off the communicator */
if (OMPI_COMM_IS_INTER(comm)) {
size = ompi_comm_remote_size(comm);
} else {
size = ompi_comm_size(comm);
}
data = malloc(sizeof(struct mca_coll_base_comm_t) +
(sizeof(ompi_request_t *) * size * 2));
if (NULL == data) {
return NULL;
}
data->mccb_reqs = (ompi_request_t **) (data + 1);
data->mccb_num_reqs = size * 2;
/*
* now for the cached topo functions
* guess the initial topologies to use rank 0 as root
*/
/* get default fanouts is made available via the MCA */
tree_fanout_default = 2; /* make it binary for now */
chain_fanout_default = 4;
data->cached_tree = ompi_coll_tuned_topo_build_tree (tree_fanout_default, comm, 0);
data->cached_tree_root = 0;
data->cached_tree_fanout = tree_fanout_default;
data->cached_bmtree = ompi_coll_tuned_topo_build_bmtree (comm, 0);
data->cached_tree_root = 0;
/*
* chains (fanout followed by pipelines)
* are more difficuilt as the fan out really really depends on message size [sometimes]..
* as size gets larger fan-out gets smaller [usually]
*
* will probably change how we cache this later, for now a midsize
* GEF
*/
data->cached_chain = ompi_coll_tuned_topo_build_chain (chain_fanout_default, comm, 0);
data->cached_chain_root = 0;
data->cached_chain_fanout = chain_fanout_default;
/* All done */
comm->c_coll_selected_data = data;
printf("Tuned looks like it is in use :)\n");
return to_use;
}
/*
* Finalize module on the communicator
*/
int mca_coll_tuned_module_finalize(struct ompi_communicator_t *comm)
{
if (NULL == comm->c_coll_selected_module) {
return OMPI_SUCCESS;
}
#if OMPI_ENABLE_DEBUG
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
the generel c_coll_selected_data */
comm->c_coll_selected_data->mccb_reqs = NULL;
comm->c_coll_selected_data->mccb_num_reqs = 0;
#endif
/* All done */
free(comm->c_coll_selected_data);
comm->c_coll_selected_data = NULL;
return OMPI_SUCCESS;
}

378
ompi/mca/coll/tuned/coll_tuned_topo.c Обычный файл
Просмотреть файл

@ -0,0 +1,378 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "mpi.h"
#include "ompi/include/constants.h"
#include "datatype/datatype.h"
#include "communicator/communicator.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/coll_tags.h"
#include "mca/pml/pml.h"
#include "coll_tuned_topo.h"
/*
* Some static helpers.
*/
static int pown( int fanout, int num )
{
int j, p = 1;
if( num < 0 ) return 0;
for( j = 0; j < num; j++ ) p*= fanout;
return p;
}
static int calculate_level( int fanout, int rank )
{
int level, num;
if( rank < 0 ) return -1;
for( level = 0, num = 0; num <= rank; level++ ) {
num += pown(fanout, level);
}
return level-1;
}
static int calculate_num_nodes_up_to_level( int fanout, int level )
{
/* just use geometric progression formula for sum:
a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
return ((pown(fanout,level) - 1)/(fanout - 1));
}
/*
* And now the building functions.
*/
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_tree( int fanout,
struct ompi_communicator_t* comm,
int root )
{
int rank, size;
int schild, sparent;
int level; /* location of my rank in the tree structure of size */
int delta; /* number of nodes on my level */
int slimit; /* total number of nodes on levels above me */
int shiftedrank;
int i;
ompi_coll_tree_t* tree;
printf("Building tuned topo tree: fo %d rt %d\n", fanout, root);
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
tree->tree_root = MPI_UNDEFINED;
tree->tree_nextsize = MPI_UNDEFINED;
/*
* Check if we calculated the tree for this root and
* fanout combination already (on this communicator)
*/
if( (root == tree->tree_root) && (fanout == tree->tree_fanout) ) {
return tree;
}
/*
* Set root
*/
tree->tree_root = root;
/*
* Initialize tree
*/
tree->tree_fanout = fanout;
tree->tree_root = root;
tree->tree_prev = -1;
tree->tree_nextsize = 0;
for( i = 0; i < fanout; i++ ) {
tree->tree_next[i] = -1;
}
/* return if we have less than 2 processes */
if( size < 2 ) {
return tree;
}
/*
* Shift all ranks by root, so that the algorithm can be
* designed as if root would be always 0
* shiftedrank should be used in calculating distances
* and position in tree
*/
shiftedrank = rank - root;
if( shiftedrank < 0 ) {
shiftedrank += size;
}
/* calculate my level */
level = calculate_level( fanout, shiftedrank );
delta = pown( fanout, level );
/* find my children */
for( i = 0; i < fanout; i++ ) {
schild = shiftedrank + delta * (i+1);
if( schild < size ) {
tree->tree_next[i] = (schild+root)%size;
tree->tree_nextsize = tree->tree_nextsize + 1;
} else {
break;
}
}
/* find my parent */
slimit = calculate_num_nodes_up_to_level( fanout, level );
sparent = shiftedrank;
if( sparent < fanout ) {
sparent = 0;
} else {
while( sparent >= slimit ) {
sparent -= delta/fanout;
}
}
tree->tree_prev = (sparent+root)%size;
return tree;
}
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
{
return OMPI_SUCCESS;
}
ompi_coll_bmtree_t*
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
int root )
{
int childs = 0;
int rank;
int size;
int mask = 1;
int index;
int remote;
ompi_coll_bmtree_t *bmtree;
printf("Building tuned topo bmtree: rt %d\n", root);
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
index = rank -root;
bmtree = (ompi_coll_bmtree_t*)malloc(sizeof(ompi_coll_bmtree_t));
bmtree->bmtree_root = MPI_UNDEFINED;
bmtree->bmtree_nextsize = MPI_UNDEFINED;
if( bmtree->bmtree_root == root ) {
/* the bmtree was computed before */
return bmtree;
}
if( index < 0 ) index += size;
while( mask <= index ) mask <<= 1;
/* Now I can compute my father rank */
if( root == rank ) {
bmtree->bmtree_prev = root;
} else {
remote = (index ^ (mask >> 1)) + root;
if( remote >= size ) remote -= size;
bmtree->bmtree_prev = remote;
}
/* And now let's fill my childs */
while( mask < size ) {
remote = (index ^ mask);
if( remote >= size ) break;
remote += root;
if( remote >= size ) remote -= size;
bmtree->bmtree_next[childs] = remote;
mask <<= 1;
childs++;
}
bmtree->bmtree_nextsize = childs;
bmtree->bmtree_root = root;
return bmtree;
}
int ompi_coll_tuned_topo_destroy_bmtree( ompi_coll_bmtree_t** bmtree )
{
return OMPI_SUCCESS;
}
ompi_coll_chain_t*
ompi_coll_tuned_topo_build_chain( int fanout,
struct ompi_communicator_t* comm,
int root )
{
int rank, size;
int srank; /* shifted rank */
int i,maxchainlen;
int mark,head,len;
ompi_coll_chain_t *chain;
printf("Building tuned topo chain: fo %d rt %d\n", fanout, root);
/*
* Get size and rank of the process in this communicator
*/
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
if( fanout > MAXTREEFANOUT ) {
return NULL;
}
/*
* Allocate space for topology arrays if needed
*/
chain = (ompi_coll_chain_t*)malloc( sizeof(ompi_coll_chain_t) );
chain->chain_root = MPI_UNDEFINED;
chain->chain_nextsize = -1;
chain->chain_numchain = -1;
/*
* Check if we calculated the topology for this root and comm
*/
if( (root == chain->chain_root) &&
(fanout == chain->chain_numchain) ) {
return chain;
}
/*
* Set root & numchain
*/
chain->chain_root = root;
if( (size - 1) < fanout ) {
chain->chain_numchain = size-1;
chain->chain_nextsize = size-1;
fanout = size-1;
} else {
chain->chain_numchain = fanout;
chain->chain_nextsize = fanout;
}
/*
* Shift ranks
*/
srank = rank - root;
if (srank < 0) srank += size;
/*
* Special case - fanout == 1
*/
if( fanout == 1 ) {
if( srank == 0 ) chain->chain_prev = -1;
else chain->chain_prev = (srank-1+root)%size;
if( (srank + 1) >= size) {
chain->chain_next[0] = -1;
chain->chain_nextsize = 0;
} else {
chain->chain_next[0] = (srank+1+root)%size;
chain->chain_nextsize = 1;
}
return chain;
}
/* Let's handle the case where there is just one node in the communicator */
if( size == 1 ) {
chain->chain_next[0] = -1;
chain->chain_nextsize = 0;
chain->chain_prev = -1;
chain->chain_numchain = 0;
return chain;
}
/*
* Calculate maximum chain length
*/
maxchainlen = (size-1) / fanout;
if( (size-1) % fanout != 0 ) {
maxchainlen++;
mark = (size-1)%fanout;
} else {
mark = fanout+1;
}
/*
* Find your own place in the list of shifted ranks
*/
if( srank != 0 ) {
int column;
if( srank-1 < (mark * maxchainlen) ) {
column = (srank-1)/maxchainlen;
head = 1+column*maxchainlen;
len = maxchainlen;
} else {
column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
len = maxchainlen-1;
}
if( srank == head ) {
chain->chain_prev = 0; /*root*/
} else {
chain->chain_prev = srank-1; /* rank -1 */
}
if( srank == (head + len - 1) ) {
chain->chain_next[0] = -1;
chain->chain_nextsize = 0;
} else {
if( (srank + 1) < size ) {
chain->chain_next[0] = srank+1;
chain->chain_nextsize = 1;
} else {
chain->chain_next[0] = -1;
chain->chain_nextsize = 0;
}
}
}
/*
* Unshift values
*/
if( rank == root ) {
chain->chain_prev = -1;
chain->chain_next[0] = (root+1)%size;
for( i = 1; i < fanout; i++ ) {
chain->chain_next[i] = chain->chain_next[i-1] + maxchainlen;
if( i > mark ) {
chain->chain_next[i]--;
}
chain->chain_next[i] %= size;
}
chain->chain_nextsize = fanout;
} else {
chain->chain_prev = (chain->chain_prev+root)%size;
if( chain->chain_next[0] != -1 ) {
chain->chain_next[0] = (chain->chain_next[0]+root)%size;
}
}
return chain;
}
int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain )
{
return OMPI_SUCCESS;
}

74
ompi/mca/coll/tuned/coll_tuned_topo.h Обычный файл
Просмотреть файл

@ -0,0 +1,74 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
#define MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED
#include "ompi_config.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C"
{
#endif
#define MAXTREEFANOUT 32
typedef struct ompi_coll_tree_t {
int32_t tree_root;
int32_t tree_fanout;
int32_t tree_prev;
int32_t tree_next[MAXTREEFANOUT];
int32_t tree_nextsize;
} ompi_coll_tree_t;
typedef struct ompi_coll_bmtree_t {
int32_t bmtree_root;
int32_t bmtree_prev;
int32_t bmtree_next[MAXTREEFANOUT];
int32_t bmtree_nextsize;
} ompi_coll_bmtree_t;
typedef struct ompi_coll_chain_t {
int32_t chain_root;
int32_t chain_prev;
int32_t chain_next[MAXTREEFANOUT];
int32_t chain_nextsize;
int32_t chain_numchain;
} ompi_coll_chain_t;
ompi_coll_tree_t*
ompi_coll_tuned_topo_build_tree( int fanout,
struct ompi_communicator_t* com,
int root );
int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree );
ompi_coll_bmtree_t*
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
int root );
int ompi_coll_tuned_topo_destroy_bmtree( ompi_coll_bmtree_t** bmtree );
ompi_coll_chain_t*
ompi_coll_tuned_topo_build_chain( int fanout,
struct ompi_communicator_t* com,
int root );
int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain );
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif /* MCA_COLL_TUNED_TOPO_H_HAS_BEEN_INCLUDED */

21
ompi/mca/coll/tuned/configure.params Обычный файл
Просмотреть файл

@ -0,0 +1,21 @@
# -*- shell-script -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
PARAM_INIT_FILE=coll_tuned.c
PARAM_CONFIG_FILES=Makefile