/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #include "ompi_config.h" #include "mpi.h" #include "ompi/constants.h" #include "ompi/datatype/datatype.h" #include "ompi/communicator/communicator.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" #include "coll_tuned.h" #include "coll_tuned_topo.h" /* * Some static helpers. */ static int pown( int fanout, int num ) { int j, p = 1; if( num < 0 ) return 0; if (1==num) return fanout; if (2==fanout) { return p<MAXTREEFANOUT) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT)); return NULL; } /* * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); if (!tree) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo_build_tree PANIC::out of memory")); return NULL; } tree->tree_root = MPI_UNDEFINED; tree->tree_nextsize = MPI_UNDEFINED; /* * Set root */ tree->tree_root = root; /* * Initialize tree */ tree->tree_fanout = fanout; tree->tree_bmtree = 0; tree->tree_root = root; tree->tree_prev = -1; tree->tree_nextsize = 0; for( i = 0; i < fanout; i++ ) { tree->tree_next[i] = -1; } /* return if we have less than 2 processes */ if( size < 2 ) { return tree; } /* * Shift all ranks by root, so that the algorithm can be * designed as if root would be always 0 * shiftedrank should be used in calculating distances * and position in tree */ shiftedrank = rank - root; if( shiftedrank < 0 ) { shiftedrank += size; } /* calculate my level */ level = calculate_level( fanout, shiftedrank ); delta = pown( fanout, level ); /* find my children */ for( i = 0; i < fanout; i++ ) { schild = shiftedrank + delta * (i+1); if( schild < size ) { tree->tree_next[i] = (schild+root)%size; tree->tree_nextsize = tree->tree_nextsize + 1; } else { break; } } /* find my parent */ slimit = calculate_num_nodes_up_to_level( fanout, level ); sparent = shiftedrank; if( sparent < fanout ) { sparent = 0; } else { while( sparent >= slimit ) { sparent -= delta/fanout; } } tree->tree_prev = (sparent+root)%size; return tree; } /* * Constructs in-order binary tree which can be used for non-commutative reduce * operations. * Root of this tree is always rank (size-1) and fanout is 2. * Here are some of the examples of this tree: * size == 2 size = 4 size = 9 * 1 3 8 * / / \ / \ * 0 2 1 7 3 * / / \ / \ * 0 6 5 2 1 * / / * 4 0 */ ompi_coll_tree_t* ompi_coll_tuned_topo_build_in_order_bintree( struct ompi_communicator_t* comm ) { int rank, size; int myrank, rightsize, delta; int parent, lchild, rchild; ompi_coll_tree_t* tree; /* * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); if (!tree) { OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_tree PANIC::out of memory")); return NULL; } tree->tree_root = MPI_UNDEFINED; tree->tree_nextsize = MPI_UNDEFINED; /* * Initialize tree */ tree->tree_fanout = 2; tree->tree_bmtree = 0; tree->tree_root = size - 1; tree->tree_prev = -1; tree->tree_nextsize = 0; tree->tree_next[0] = -1; tree->tree_next[1] = -1; OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo_build_in_order_tree Building fo %d rt %d", tree->tree_fanout, tree->tree_root)); /* * Build the tree */ myrank = rank; parent = size - 1; delta = 0; while ( 1 ) { /* Compute the size of the right subtree */ rightsize = size >> 1; /* Determine the left and right child of this parent */ lchild = -1; rchild = -1; if (size - 1 > 0) { lchild = parent - 1; if (lchild > 0) { rchild = rightsize - 1; } } /* The following cases are possible: myrank can be - a parent, - belong to the left subtree, or - belong to the right subtee Each of the cases need to be handled differently. */ if (myrank == parent) { /* I am the parent: - compute real ranks of my children, and exit the loop. */ if (lchild >= 0) tree->tree_next[0] = lchild + delta; if (rchild >= 0) tree->tree_next[1] = rchild + delta; break; } if (myrank > rchild) { /* I belong to the left subtree: - If I am the left child, compute real rank of my parent - Iterate down through tree: compute new size, shift ranks down, and update delta. */ if (myrank == lchild) { tree->tree_prev = parent + delta; } size = size - rightsize - 1; delta = delta + rightsize; myrank = myrank - rightsize; parent = size - 1; } else { /* I belong to the right subtree: - If I am the right child, compute real rank of my parent - Iterate down through tree: compute new size and parent, but the delta and rank do not need to change. */ if (myrank == rchild) { tree->tree_prev = parent + delta; } size = rightsize; parent = rchild; } } if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; } if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; } return tree; } int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree ) { ompi_coll_tree_t *ptr; if ((!tree)||(!*tree)) { return OMPI_SUCCESS; } ptr = *tree; free (ptr); *tree = NULL; /* mark tree as gone */ return OMPI_SUCCESS; } ompi_coll_tree_t* ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm, int root ) { int childs = 0; int rank; int size; int mask = 1; int index; int remote; ompi_coll_tree_t *bmtree; int i; OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree rt %d", root)); /* * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); index = rank -root; bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t)); if (!bmtree) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree PANIC out of memory")); return NULL; } bmtree->tree_bmtree = 1; bmtree->tree_root = MPI_UNDEFINED; bmtree->tree_nextsize = MPI_UNDEFINED; for(i=0;itree_next[i] = -1; } if( index < 0 ) index += size; while( mask <= index ) mask <<= 1; /* Now I can compute my father rank */ if( root == rank ) { bmtree->tree_prev = root; } else { remote = (index ^ (mask >> 1)) + root; if( remote >= size ) remote -= size; bmtree->tree_prev = remote; } /* And now let's fill my childs */ while( mask < size ) { remote = (index ^ mask); if( remote >= size ) break; remote += root; if( remote >= size ) remote -= size; if (childs==MAXTREEFANOUT) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs)); return NULL; } bmtree->tree_next[childs] = remote; mask <<= 1; childs++; } bmtree->tree_nextsize = childs; bmtree->tree_root = root; return bmtree; } ompi_coll_tree_t* ompi_coll_tuned_topo_build_chain( int fanout, struct ompi_communicator_t* comm, int root ) { int rank, size; int srank; /* shifted rank */ int i,maxchainlen; int mark,head,len; ompi_coll_tree_t *chain; OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain fo %d rt %d", fanout, root)); /* * Get size and rank of the process in this communicator */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); if( fanout < 1 ) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!")); fanout = 1; } if (fanout>MAXTREEFANOUT) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT)); fanout = MAXTREEFANOUT; } /* * Allocate space for topology arrays if needed */ chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) ); if (!chain) { OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:build_chain PANIC out of memory")); fflush(stdout); return NULL; } chain->tree_root = MPI_UNDEFINED; chain->tree_nextsize = -1; for(i=0;itree_next[i] = -1; /* * Set root & numchain */ chain->tree_root = root; if( (size - 1) < fanout ) { chain->tree_nextsize = size-1; fanout = size-1; } else { chain->tree_nextsize = fanout; } /* * Shift ranks */ srank = rank - root; if (srank < 0) srank += size; /* * Special case - fanout == 1 */ if( fanout == 1 ) { if( srank == 0 ) chain->tree_prev = -1; else chain->tree_prev = (srank-1+root)%size; if( (srank + 1) >= size) { chain->tree_next[0] = -1; chain->tree_nextsize = 0; } else { chain->tree_next[0] = (srank+1+root)%size; chain->tree_nextsize = 1; } return chain; } /* Let's handle the case where there is just one node in the communicator */ if( size == 1 ) { chain->tree_next[0] = -1; chain->tree_nextsize = 0; chain->tree_prev = -1; return chain; } /* * Calculate maximum chain length */ maxchainlen = (size-1) / fanout; if( (size-1) % fanout != 0 ) { maxchainlen++; mark = (size-1)%fanout; } else { mark = fanout+1; } /* * Find your own place in the list of shifted ranks */ if( srank != 0 ) { int column; if( srank-1 < (mark * maxchainlen) ) { column = (srank-1)/maxchainlen; head = 1+column*maxchainlen; len = maxchainlen; } else { column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1); head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1); len = maxchainlen-1; } if( srank == head ) { chain->tree_prev = 0; /*root*/ } else { chain->tree_prev = srank-1; /* rank -1 */ } if( srank == (head + len - 1) ) { chain->tree_next[0] = -1; chain->tree_nextsize = 0; } else { if( (srank + 1) < size ) { chain->tree_next[0] = srank+1; chain->tree_nextsize = 1; } else { chain->tree_next[0] = -1; chain->tree_nextsize = 0; } } } /* * Unshift values */ if( rank == root ) { chain->tree_prev = -1; chain->tree_next[0] = (root+1)%size; for( i = 1; i < fanout; i++ ) { chain->tree_next[i] = chain->tree_next[i-1] + maxchainlen; if( i > mark ) { chain->tree_next[i]--; } chain->tree_next[i] %= size; } chain->tree_nextsize = fanout; } else { chain->tree_prev = (chain->tree_prev+root)%size; if( chain->tree_next[0] != -1 ) { chain->tree_next[0] = (chain->tree_next[0]+root)%size; } } return chain; } int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank) { int i; OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:topo:topo_dump_tree %1d tree root %d" " fanout %d BM %1d nextsize %d prev %d", rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout, tree->tree_nextsize, tree->tree_prev)); if( tree->tree_nextsize ) { for( i = 0; i < tree->tree_nextsize; i++ ) OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d", i, tree->tree_next[i])); } return (0); }