- update the hierarch stuff to use btl's instead of ptl's
- start the new logic regarding how to handle local leader communicators

This commit was SVN r7691.
This commit is contained in:
Edgar Gabriel 2005-10-11 17:29:59 +00:00
parent 9a1db9abba
commit b42d4ac780
6 changed files with 125 additions and 514 deletions
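Before the per-file diffs, a condensed sketch of the central change in this commit: the protocol check no longer walks the PTL lists of each peer, it looks at the first BTL in the peer's BML endpoint. The loop below is assembled from the added lines in the first file diff; it is a sketch, not a drop-in function, and assumes that size, rank, bml_endpoints, use_rdma, component_name, counter and firstproc have been set up as in the patch.

for ( i = 0; i < size; i++ ) {
    if ( rank == i ) {
        continue;                                   /* skip myself */
    }
    /* pick the rdma or the send list of this peer's BML endpoint */
    if ( use_rdma ) {
        bml_btl_array = &(bml_endpoints[i]->btl_rdma);
    }
    else {
        bml_btl_array = &(bml_endpoints[i]->btl_send);
    }
    /* look only at the first (i.e. preferred) BTL for this peer */
    bml_btl = mca_bml_base_btl_array_get_index ( bml_btl_array, 0 );
    btl     = bml_btl->btl->btl_component;

    if ( !strcmp (btl->btl_version.mca_component_name, component_name) ) {
        counter++;                                  /* one more peer reachable via the requested BTL */
        if ( i < firstproc ) {
            firstproc = i;                          /* remember the lowest such rank */
        }
    }
}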

View file

@@ -21,19 +21,23 @@
#include "mpi.h"
#include "communicator/communicator.h"
#include "proc/proc.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "coll_hierarch.h"
#include "mca/ptl/ptl.h"
#include "mca/pml/teg/src/pml_teg_proc.h"
#include "mca/pml/teg/src/pml_teg_ptl.h"
#include "class/ompi_bitmap.h"
#include "mca/bml/bml.h"
#include "mca/bml/base/base.h"
#include "mca/pml/pml.h"
#include "mca/btl/btl.h"
/* local functions and data */
#define HIER_MAXPROTOCOL 7
static int mca_coll_hierarch_max_protocol=HIER_MAXPROTOCOL;
static char hier_prot[HIER_MAXPROTOCOL][5]={"0","tcp","ib","gm","mx","elan4","sm"};
static char hier_prot[HIER_MAXPROTOCOL][6]={"0","tcp","gm","mx","mvapi","openib","sm"};
static void mca_coll_hierarch_checkfor_component (struct ompi_communicator_t *comm,
char *component_name, int *key,
@@ -113,7 +117,6 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
/* This module only works for intra-communicators at the moment */
if ( OMPI_COMM_IS_INTER(comm) ) {
*priority = 0;
return NULL;
}
@@ -190,13 +193,11 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
const struct mca_coll_base_module_1_0_0_t *
mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
{
int color, ncount;
int *colorarr=NULL, *llr=NULL;
int color;
int *llr=NULL;
int size, rank, ret=OMPI_SUCCESS;
int i, j, c, level;
int found;
struct ompi_communicator_t *llcomm=NULL;
struct ompi_communicator_t *lcomm=NULL;
struct mca_coll_base_comm_t *data=NULL;
rank = ompi_comm_rank(comm);
@@ -207,13 +208,13 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
/* Generate the subcommunicator based on the color returned by
the previous function. */
ret = ompi_comm_split ( comm, color, rank, &llcomm, 0 );
ret = ompi_comm_split ( comm, color, rank, &lcomm, 0 );
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
data->hier_comm = comm;
data->hier_llcomm = llcomm;
data->hier_lcomm = lcomm;
data->hier_num_reqs = 2 * size;
data->hier_reqs = (ompi_request_t **) malloc (sizeof(ompi_request_t)*size*2);
if ( NULL == data->hier_reqs ) {
@@ -238,6 +239,21 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
data->hier_am_lleader = 1; /*true */
}
/* Generate the lleader communicator assuming that each lleader is the first
process in the list of processes with the same color. A function generating
other lleader-comms will follow soon. */
ret = ompi_comm_split ( comm, data->hier_am_lleader, rank, &llcomm, 0);
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
data->hier_llcomm = (struct ompi_communicator_t *)malloc (HIER_DEFAULT_NUM_LLCOMM *
sizeof(struct ompi_communicator_t *));
if ( NULL == data->hier_llcomm ) {
goto exit;
}
data->hier_num_llcomm = HIER_DEFAULT_NUM_LLCOMM;
data->hier_llcomm[0] = llcomm;
/* This is the point where I will later introduce a function that tries to
compact the colorarr array. Not done at the moment. */
@@ -248,7 +264,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
free (llr);
}
if ( OMPI_SUCCESS != ret ) {
ompi_comm_free ( &llcomm );
ompi_comm_free ( &lcomm );
if ( NULL != data ) {
if ( NULL != data->hier_reqs ) {
free ( data->hier_reqs);
@@ -274,19 +290,16 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
*/
int mca_coll_hierarch_module_finalize(struct ompi_communicator_t *comm)
{
struct ompi_communicator_t *llcomm=NULL;
struct ompi_communicator_t *lcomm=NULL;
struct mca_coll_base_comm_t *data=NULL;
data = comm->c_coll_selected_data;
llcomm = data->hier_llcomm;
lcomm = data->hier_lcomm;
ompi_comm_free (&llcomm);
ompi_comm_free (&lcomm);
free ( data->hier_reqs );
free ( data->hier_lleaders );
free ( data->hier_colorarr );
if ( NULL != data->hier_topo.topo_next ) {
free (data->hier_topo.topo_next);
}
free ( data );
comm->c_coll_selected_data = NULL;
@@ -316,86 +329,78 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
int *key,
int *ncount )
{
mca_pml_proc_t *proc=NULL;
mca_ptl_proc_t *ptl_proc=NULL;
mca_ptl_base_module_t *ptl_module=NULL;
mca_ptl_base_component_t *ptr=NULL;
ompi_bitmap_t reachable;
ompi_proc_t **procs=NULL;
struct mca_bml_base_endpoint_t **bml_endpoints=NULL;
struct mca_bml_base_btl_array_t *bml_btl_array=NULL;
mca_bml_base_btl_t *bml_btl=NULL;
mca_btl_base_component_t *btl=NULL;
int i, j, size;
int i, size, rc;
int counter=0;
int firstproc=999999;
int rank = -1;
int listsize=1;
int use_next, walk_through_list;
int use_rdma=0;
/* default values in case an error occurs */
*ncount=0;
*key=MPI_UNDEFINED;
/* Shall we just check the first element in the ptl list ? */
if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_walk_through_list_param,
&walk_through_list)) {
/* Shall we check the rdma list instead of the send-list in the endpoint-structure? */
/* if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_rdma_param,
&use_rdma)) {
return;
}
/* Shall we use the first_elem list or the next_elem list? */
if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_use_next_param,
&use_next)) {
return;
}
*/
size = ompi_comm_size ( comm );
rank = ompi_comm_rank ( comm );
OBJ_CONSTRUCT(&reachable, ompi_bitmap_t);
rc = ompi_bitmap_init(&reachable, size);
if(OMPI_SUCCESS != rc) {
return;
}
rc = mca_bml.bml_add_procs (
size,
procs,
bml_endpoints,
&reachable
);
if(OMPI_SUCCESS != rc) {
return;
}
for ( i=0; i<size; i++ ) {
if ( rank == i ) {
/* skip myself */
continue;
}
proc = mca_pml_teg_proc_lookup_remote (comm, i);
if ( use_next ) {
ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_first);
if ( walk_through_list ) {
/*
* Walking through the list might be unnecessary. The assumption is
* that if we did not register this as the first protocol, there is
* a protocol which is faster than this one.
*
* Example: on an IB cluster with dual-processor nodes, I can talk
* to all procs via IB; however, the process on my node will
* hopefully have sm registered as its first protocol.
*/
listsize = mca_ptl_array_get_size(&proc->proc_ptl_first);
}
if ( use_rdma ) {
bml_btl_array = &(bml_endpoints[i]->btl_rdma);
}
else {
ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_next);
if ( walk_through_list ) {
listsize = mca_ptl_array_get_size(&proc->proc_ptl_next);
}
bml_btl_array = &(bml_endpoints[i]->btl_send);
}
bml_btl = mca_bml_base_btl_array_get_index ( bml_btl_array, 0 );
btl = bml_btl->btl->btl_component;
for ( j=0; j<listsize;j++) {
ptl_module = ptl_proc->ptl;
ptr = ptl_module->ptl_component;
/* sanity check */
if ( strcmp(btl->btl_version.mca_type_name,"btl") ) {
printf("Oops, got the wrong component! type_name = %s\n",
btl->btl_version.mca_type_name );
}
/* sanity check */
if ( strcmp(ptr->ptlm_version.mca_type_name,"ptl") ) {
printf("Oops, got the wrong component! type_name = %s\n",
ptr->ptlm_version.mca_type_name );
}
/* check for the required component */
if (! strcmp (btl->btl_version.mca_component_name, component_name)){
counter++;
/* check for the required component */
if (! strcmp (ptr->ptlm_version.mca_component_name, component_name)){
counter++;
if (i<firstproc ) {
firstproc = i;
}
if (i<firstproc ) {
firstproc = i;
}
}
}
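Because added and removed lines are interleaved above, the new two-level communicator setup in module_init is easier to see in isolation. Below is a sketch of the two ompi_comm_split calls as they read after this patch; names follow the patch, and error handling and allocation of the llcomm cache are omitted.

/* step 1: processes with the same color (i.e. sharing the same fast
 * interconnect, e.g. sm) form the low-level communicator lcomm */
ret = ompi_comm_split ( comm, color, rank, &lcomm, 0 );
data->hier_lcomm = lcomm;

/* step 2: the designated local leaders (hier_am_lleader == 1) form the
 * local-leader communicator llcomm; for now each lleader is simply the
 * first process of its color group */
ret = ompi_comm_split ( comm, data->hier_am_lleader, rank, &llcomm, 0 );
data->hier_llcomm[0] = llcomm;     /* first entry of the llcomm cache */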

View file

@@ -23,7 +23,6 @@
#include "mca/mca.h"
#include "mca/coll/coll.h"
#include "request/request.h"
#include "mca/pml/pml.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
@@ -40,33 +39,32 @@ extern int mca_coll_hierarch_verbose;
extern int mca_coll_hierarch_walk_through_list_param;
extern int mca_coll_hierarch_use_next_param;
#define HIER_DEFAULT_NUM_LLCOMM 5
/*
* Data structure for attaching data to the communicator
*/
struct mca_coll_hierarch_topo {
int topo_root;
int topo_prev;
int topo_nextsize;
int topo_maxsize;
int *topo_next;
struct mca_coll_hierarch_llead {
struct ompi_communicator_t *hier_llcomm; /* local leader communicator */
int hier_num_lleaders; /* number of local leaders */
int *hier_lleaders; /* list of local leaders, ranks in comm */
int hier_my_lleader_on_lcomm; /* rank of my lleader in llcomm */
int hier_am_lleader; /* am I an lleader? */
int hier_my_lleader; /* pos. of my lleader in hier_lleaders */
};
struct mca_coll_base_comm_t {
struct ompi_communicator_t *hier_comm; /* link back to the attached comm */
struct ompi_communicator_t *hier_llcomm; /* low level communicator */
int hier_level; /* level in the hierarchy. just debugging */
int hier_num_lleaders; /* number of local leaders */
int *hier_lleaders; /* list of local leaders, ranks in comm */
int hier_my_lleader; /* pos. of my lleader in hier_lleaders */
int hier_my_lleader_on_llcomm; /* rank of my lleader in llcomm */
int hier_am_lleader; /* am I an lleader? */
int hier_num_reqs; /* num. of requests */
ompi_request_t **hier_reqs; /* list of requests */
int hier_type_colorarr; /* format in which the colorarr is stored */
int hier_num_colorarr; /* size of the colorarr array */
int* hier_colorarr; /* array containing the color of all procs */
struct mca_coll_hierarch_topo hier_topo; /* topology used in the coll ops */
struct ompi_communicator_t *hier_comm; /* link back to the attached comm */
struct ompi_communicator_t *hier_lcomm; /* low level communicator */
struct mca_coll_hierarch_llead *hier_llead; /* structure for lleader communicator */
int hier_num_llead; /* number of llead structs */
int hier_level; /* level in the hierarchy. just debugging */
int hier_num_reqs; /* num. of requests */
ompi_request_t **hier_reqs; /* list of requests */
int hier_type_colorarr; /* format in which the colorarr is stored */
int hier_num_colorarr; /* size of the colorarr array */
int* hier_colorarr; /* array containing the color of all procs */
};
/* These are various modes how the colorarr is stored. The reason
@@ -163,6 +161,13 @@ static inline void mca_coll_hierarch_get_all_lleaders ( int size, int *carr, int
return;
}
static inline struct ompi_communicator_t* mca_coll_hierarch_get_llcomm ( int rank,
struct mca_coll_base_comm_t *data,
int* lleader )
{
return NULL;
}
static inline void mca_coll_hierarch_get_lleader (int rank, struct mca_coll_base_comm_t *data,
int* lleader )
{
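As a reading aid for the structures above, a hypothetical snippet showing how the new fields are meant to relate. Only the field and type names are taken from this commit; the variable names and the idea of reading the first cache entry are illustrative assumptions, since mca_coll_hierarch_get_llcomm() is still a stub returning NULL at this point.

struct mca_coll_base_comm_t *data   = comm->c_coll_selected_data;   /* attached to the comm */
struct ompi_communicator_t *lcomm   = data->hier_lcomm;             /* low-level (on-node) comm */
struct mca_coll_hierarch_llead *ll  = &(data->hier_llead[0]);       /* first cached lleader set (assumed allocated) */
struct ompi_communicator_t *llcomm  = ll->hier_llcomm;              /* communicator of those lleaders */
int am_lleader                      = ll->hier_am_lleader;          /* 1 if this process is an lleader */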

View file

@@ -35,7 +35,7 @@
int mca_coll_hierarch_barrier_intra(struct ompi_communicator_t *comm)
{
opal_output_verbose(10, mca_coll_base_output, "In hierarch barrier_intra");
return comm->c_coll_basic_module->coll_barrier(comm);
return MPI_SUCCESS;
}

View file

@@ -20,6 +20,7 @@
#include "mpi.h"
#include "ompi/include/constants.h"
#include "opal/util/output.h"
#include "communicator/communicator.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "mca/coll/base/coll_tags.h"
@@ -28,28 +29,12 @@
/*
* bcast_intra
*
* Function: - broadcast using O(N) algorithm
* Function: - broadcast using hierarchical algorithm
* Accepts: - same arguments as MPI_Bcast()
* Returns: - MPI_SUCCESS or error code
*/
static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
int count,
ompi_datatype_t * datatype,
int root,
ompi_communicator_t * comm,
int segsize,
struct mca_coll_hierarch_topo *topo);
static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
ompi_datatype_t *datatype,
int root,
struct mca_coll_base_comm_t *data,
int *segsize);
static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data );
int mca_coll_hierarch_bcast_intra(void *buff,
@@ -60,44 +45,25 @@ int mca_coll_hierarch_bcast_intra(void *buff,
{
struct mca_coll_base_comm_t *data=NULL;
struct ompi_communicator_t *llcomm=NULL;
int lleader_of_root, lleader_replaced_by_root=0;
int rank, ret, lroot;
int segsize;
struct ompi_communicator_t *lcomm=NULL;
int lleader;
int rank, ret, llroot;
rank = ompi_comm_rank ( comm );
data = comm->c_coll_selected_data;
llcomm = data->hier_llcomm;
lcomm = data->hier_lcomm;
/* Determine whether
a) we have the same local leader as the root of this operation
b) the root and the local leader are identical
If a) is true and b) is not, we will replace the local leader for this
subgroup with the root
/* This function returns the local leader communicator
which *always* contains the root of this operation.
This might involve creating a new communicator. This is
also the reason that *every* process in comm has to call
this function
*/
mca_coll_hierarch_get_lleader (root, data, &lleader_of_root);
if ( (lleader_of_root == data->hier_my_lleader) && (lleader_of_root != root )) {
lleader_replaced_by_root = 1;
}
llcomm = mca_coll_hierarch_get_llcomm ( root, data, &llroot);
/* Bcast on the upper level among the local leaders */
if ( rank == root || ( data->hier_am_lleader && !lleader_replaced_by_root) ) {
/* this function sets up the topology used in the segmented
bcast afterwards and determines the segment size. */
ret = mca_coll_hierarch_intra_bcast_setup_topo (count, datatype, root,
data, &segsize);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
/* ok, now do the actual bcast. Hopefully, this routine will come
out of Jelena's collective module in the end. For the moment,
I've implemented it myself
*/
ret = mca_coll_hierarch_intra_segmented_bcast (buff, count,
datatype, root,
comm, segsize,
&(data->hier_topo));
if ( MPI_UNDEFINED != llroot ) {
ret = llcomm->c_coll.coll_bcast(buff, count, datatype, llroot, llcomm);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
@@ -106,271 +72,11 @@ int mca_coll_hierarch_bcast_intra(void *buff,
/* once the local leaders have received the data from the root, they can distribute
it to the processes in their local, low-level communicator.
*/
if ( MPI_COMM_NULL != llcomm ) {
if ( lleader_replaced_by_root ) {
mca_coll_hierarch_map_rank(root, data, &lroot);
ret = llcomm->c_coll.coll_bcast(buff, count, datatype, lroot,
llcomm);
}
else {
/* Assumption: the rank of the local leader on llcomm is always 0 */
ret = llcomm->c_coll.coll_bcast(buff, count, datatype, 0, llcomm );
}
mca_coll_hierarch_get_lleader (root, data, &lleader);
ret = lcomm->c_coll.coll_bcast(buff, count, datatype, lleader, lcomm );
}
return ret;
}
/*
* This is the mother of all segmented bcast algorithms of any type.
* Due to the general structure of the topo argument, you can use this function
* for any type of algorithm - it just depends on the settings of topo.
*
* The implementation leans heavily on the implementation in FT-MPI.
*/
static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
int count,
ompi_datatype_t * datatype,
int root,
ompi_communicator_t * comm,
int segsize,
struct mca_coll_hierarch_topo *topo)
{
int err=0, i, j;
int size, rank;
int segcount; /* Number of elements sent with each segment */
int num_segments; /* Number of segments */
int recvcount; /* the same as segcount, except for the last segment */
int typelng, realsegsize;
char *tmpbuf;
long rlb, ext;
ompi_request_t ** recv_request= NULL;
size = ompi_comm_size ( comm );
rank = ompi_comm_rank ( comm );
/* ------------------------------------------- */
/* special case for size == 1 and 2 */
if (size == 1) {
return OMPI_SUCCESS;
}
if (size == 2) {
if (rank == root) {
err = mca_pml.pml_send(buffer, count, datatype, (rank+1)%2,
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm );
if ( OMPI_SUCCESS != err ) {
return err;
}
} else {
err = mca_pml.pml_recv(buffer, count, datatype, root,
MCA_COLL_BASE_TAG_BCAST, comm,
MPI_STATUS_IGNORE);
if ( OMPI_SUCCESS != err) {
return err;
}
}
return OMPI_SUCCESS;
}
/* end special case for size == 1 and 2 */
tmpbuf = (char *) buffer;
/* -------------------------------------------------- */
/* Determine number of segments and number of elements
sent per operation */
err = ompi_ddt_type_size( datatype, &typelng);
if ( OMPI_SUCCESS != err) {
return ( err );
}
if ( segsize > 0 ) {
segcount = segsize/typelng;
num_segments = count/segcount;
if (0 != (count % segcount)) {
num_segments++;
}
}
else {
segcount = count;
num_segments = 1;
}
/* Determine real segment size = segcount * extent */
err = ompi_ddt_get_extent( datatype, &rlb, &ext );
if ( OMPI_SUCCESS != err) {
return ( err );
}
realsegsize = segcount*ext;
/* ----------------------------------------------------- */
/* Post Irecv if not root-node */
if (rank != root) {
/* has a parent. need to receive before sending */
if ( num_segments > 2 * size ) {
recv_request = (MPI_Request*)malloc ( sizeof(ompi_request_t *)*num_segments );
}
else {
recv_request = comm->c_coll_selected_data->hier_reqs;
}
for( i = 0; i < num_segments; i++) {
if ( i == (num_segments -1) ) {
recvcount = count - (segcount * i);
}
else {
recvcount = segcount;
}
err = mca_pml.pml_irecv(tmpbuf+i*realsegsize, recvcount, datatype,
topo->topo_prev, MCA_COLL_BASE_TAG_BCAST,
comm, &recv_request[i]);
if ( OMPI_SUCCESS != err ) {
return ( err );
}
}
}
/* ---------------------------------------------- */
/* If leaf node, just finish the receive */
if (topo->topo_nextsize == 0) {
if(recv_request != NULL) {
err = ompi_request_wait_all (num_segments, recv_request, MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != err ) {
return ( err );
}
}
}
else {
/* ------------------------------------------ */
/* root or intermediate node */
for( i = 0; i < num_segments; i++) {
if (rank != root) {
/* intermediate nodes have to wait for the completion of
the corresponding receive */
err = ompi_request_wait_all(1, &recv_request[i], MPI_STATUS_IGNORE);
if ( OMPI_SUCCESS != err ) {
return ( err );
}
}
for ( j = 0; j < topo->topo_nextsize; j++) {
if ( i == ( num_segments - 1 )) {
recvcount = count - ( segcount * i);
}
else {
recvcount = segcount;
}
err = mca_pml.pml_send(tmpbuf+i*realsegsize, recvcount,
datatype, topo->topo_next[j],
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD, comm );
if( OMPI_SUCCESS != err ) {
return ( err );
}
} /* for ( j = 0; j < topo_nextsize; j++) */
} /* for ( i = 0; i < num_segments; i++) */
}
if ( num_segments > 2 * size ) {
if(recv_request != NULL) {
free(recv_request);
}
}
return OMPI_SUCCESS;
}
/*
* This routine does the magic to determine which topology (bmtree, linear, chain, etc.)
* would perform best in this scenario. At the moment, we just do bmtree.
*
* The implementation is once again strongly related to the version in FT-MPI.
*/
static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
ompi_datatype_t *datatype,
int root,
struct mca_coll_base_comm_t *data,
int *segsize)
{
/* without spending time on that issue, I set segsize to 32k for the moment. */
*segsize = 32768;
/* without spending time on that issue, I set the topology to a binomial tree */
setup_topo_bmtree ( root, data );
return OMPI_SUCCESS;
}
static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data )
{
/* This implementation is based on the closest-first bmtree algorithms
in FT-MPI implemented by George/Jelena, but has a couple of
significant modifications:
- we do not have a contiguous list of participating processes,
but a list containing the ranks of the participating processes.
*/
int childs = 0;
int rank, size, mask=1;
int index, remote, found;
int rootpos;
struct mca_coll_hierarch_topo *topo=&(data->hier_topo);
if (found) {
size = data->hier_num_lleaders;
}
else {
size = data->hier_num_lleaders + 1;
data->hier_lleaders[rootpos] = root;
}
rank = data->hier_my_lleader;
/* allocate the array of child processes, if not yet done */
if ( NULL == topo->topo_next && 0 == topo->topo_maxsize ) {
topo->topo_next = (int *) malloc (data->hier_num_lleaders+1 * sizeof(int));
if ( NULL != topo->topo_next ) {
return;
}
topo->topo_maxsize=data->hier_num_lleaders+1;
}
index = rank - rootpos;
if( index < 0 ) index += size;
while( mask <= index ) mask <<= 1;
/* Determine the rank of my father */
if( rootpos == rank ) {
topo->topo_prev = root;
}
else {
remote = (index ^ (mask >> 1)) + rootpos;
if( remote >= size ) {
remote -= size;
}
topo->topo_prev = data->hier_lleaders[remote];
}
/* And now let's fill in my children */
while( mask < size ) {
remote = (index ^ mask);
if( remote >= size ) break;
remote += rootpos;
if( remote >= size ) remote -= size;
topo->topo_next[childs] = data->hier_lleaders[remote];
mask <<= 1;
childs++;
}
topo->topo_nextsize = childs;
return;
}
#endif
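The removed segmented-bcast code dominates the hunk above, so the new bcast path is easy to miss. Assembled from the added lines only, it reads roughly as follows; this is a sketch with error handling abbreviated, and the names follow the patch.

lcomm  = data->hier_lcomm;
llcomm = mca_coll_hierarch_get_llcomm ( root, data, &llroot );

/* phase 1: bcast among the local leaders; llroot is the root's rank on
 * llcomm, or MPI_UNDEFINED for processes that are not part of llcomm */
if ( MPI_UNDEFINED != llroot ) {
    ret = llcomm->c_coll.coll_bcast ( buff, count, datatype, llroot, llcomm );
}

/* phase 2: each leader forwards the data within its low-level communicator */
if ( MPI_COMM_NULL != lcomm ) {
    mca_coll_hierarch_get_lleader ( root, data, &lleader );
    ret = lcomm->c_coll.coll_bcast ( buff, count, datatype, lleader, lcomm );
}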

View file

@@ -22,11 +22,9 @@
#include "ompi_config.h"
#include "coll_hierarch.h"
#include "coll-hierarch-version.h"
#include "mpi.h"
#include "mca/coll/coll.h"
#include "coll_hierarch.h"
/*
* Public string showing the coll ompi_hierarch component version number
@@ -40,8 +38,7 @@ const char *mca_coll_hierarch_component_version_string =
int mca_coll_hierarch_priority_param = -1;
int mca_coll_hierarch_verbose_param = -1;
int mca_coll_hierarch_verbose = 0;
int mca_coll_hierarch_walk_through_list_param=-1;
int mca_coll_hierarch_use_next_param=-1;
int mca_coll_hierarch_use_rdma_param=-1;
/*
@@ -82,12 +79,10 @@ const mca_coll_base_component_1_0_0_t mca_coll_hierarch_component = {
{
/* Whether the component is checkpointable or not */
true
},
/* Initialization / querying functions */
mca_coll_hierarch_init_query,
mca_coll_hierarch_comm_query,
mca_coll_hierarch_comm_unquery
@@ -103,10 +98,8 @@ static int hierarch_open(void)
mca_coll_hierarch_verbose_param =
mca_base_param_register_int("coll", "hierarch", "verbose", NULL,
mca_coll_hierarch_verbose);
mca_coll_hierarch_walk_through_list_param =
mca_base_param_register_int("coll", "hierarch", "walk_through_list", NULL, 0);
mca_coll_hierarch_use_next_param =
mca_base_param_register_int("coll", "hierarch", "use_next", NULL, 0);
mca_base_param_register_int("coll", "hierarch", "use_rdma", NULL, 0);
return OMPI_SUCCESS;
}

View file

@@ -27,7 +27,6 @@
#include "coll_hierarch.h"
#ifdef SIMPLE_HIERARCH
/*
* reduce_intra
*
@@ -40,102 +39,5 @@ int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
struct ompi_op_t *op,
int root, struct ompi_communicator_t *comm)
{
struct mca_coll_base_comm_t *data=NULL;
struct ompi_communicator_t *llcomm=NULL;
long true_lb, true_extent, lb, extent;
char *free_buffer = NULL;
char *pml_buffer = NULL;
int i, rank, ret;
rank = ompi_comm_rank ( comm );
data = comm->c_coll_selected_data;
llcomm = data->hier_llcomm;
/*
* collect the data from the low-level communicators. Result will be stored
* on the local leaders.
*/
if ( MPI_COMM_NULL != llcomm ) {
ret = llcomm->c_coll.coll_reduce(sbuf, rbuf, count, dtype, op,
data->hier_my_lleader, llcomm );
}
/* trivial linear reduction receiving the data from all local leaders.
need something significantly better */
if ( rank == root ) {
/* Root receives and reduces messages */
ompi_ddt_get_extent(dtype, &lb, &extent);
ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);
free_buffer = malloc(true_extent + (count - 1) * extent);
if (NULL == free_buffer) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
pml_buffer = free_buffer - lb;
if ( !data->hier_am_lleader ) {
/* Initialize the receive buffer. */
ret = mca_pml.pml_recv(rbuf, count, dtype, data->hier_lleader[0],
MCA_COLL_BASE_TAG_REDUCE, comm,
MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) {
goto exit;
}
}
/* Loop receiving and calling reduction function (C or Fortran). */
for (i = 1; i < data->hier_num_lleaders; i++) {
if ( data->hier_lleader[i] == rank ) {
continue;
}
ret = mca_pml.pml_recv(pml_buffer, count, dtype, data->hier_lleader[i],
MCA_COLL_BASE_TAG_REDUCE, comm,
MPI_STATUS_IGNORE);
if (MPI_SUCCESS != ret) {
goto exit;
}
/* Perform the reduction */
ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);
}
}
else if ( data->hier_am_lleader ) {
if ( MPI_COMM_NULL != llcomm ) {
ret = mca_pml.pml_send ( rbuf, count, dtype, root,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_STANDARD,
comm);
}
else {
ret = mca_pml.pml_send ( sbuf, count, dtype, root,
MCA_COLL_BASE_TAG_REDUCE,
MCA_PML_BASE_SEND_STANDARD,
comm);
}
}
exit:
if ( NULL != free_buffer ) {
free ( free_buffer);
}
return ret;
return OMPI_SUCCESS;
}
#else
int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
struct ompi_op_t *op,
int root, struct ompi_communicator_t *comm)
{
opal_output_verbose(10, mca_coll_base_output, "In hierarch reduce_intra");
return comm->c_coll_basic_module->coll_reduce(sbuf, rbuf, count, dtype,
op, root, comm);
}
#endif