Checkpoint:
- update the hierarch stuff to use btl's instead of ptl's
- start the new logic regarding how to handle local leader communicators

This commit was SVN r7691.
This commit is contained in:
parent 9a1db9abba
commit b42d4ac780
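A minimal, self-contained sketch of the local-leader scheme this commit starts to implement: processes are first split by a color (one low-level communicator per group), the lowest rank of each group acts as the local leader, and a second split collects all leaders into the lleader communicator. The example below uses the public MPI API and an invented color formula purely for illustration; the commit itself works on OMPI's internal ompi_comm_split.

    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        int rank, color, lrank, am_lleader, llrank;
        MPI_Comm lcomm, llcomm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        /* First split: one low-level communicator per color
           (in coll/hierarch the color describes which group of
           processes shares the faster interconnect). */
        color = rank / 2;                 /* invented color, illustration only */
        MPI_Comm_split(MPI_COMM_WORLD, color, rank, &lcomm);

        /* The first process of each low-level communicator is its local leader. */
        MPI_Comm_rank(lcomm, &lrank);
        am_lleader = (0 == lrank);

        /* Second split: every process calls it, but only the local leaders
           end up in the communicator that is actually used afterwards. */
        MPI_Comm_split(MPI_COMM_WORLD, am_lleader, rank, &llcomm);
        if (am_lleader) {
            MPI_Comm_rank(llcomm, &llrank);
            printf("rank %d: local leader %d for color %d\n", rank, llrank, color);
        }

        MPI_Comm_free(&llcomm);
        MPI_Comm_free(&lcomm);
        MPI_Finalize();
        return 0;
    }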
@@ -21,19 +21,23 @@

#include "mpi.h"
#include "communicator/communicator.h"
#include "proc/proc.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "coll_hierarch.h"

#include "mca/ptl/ptl.h"
#include "mca/pml/teg/src/pml_teg_proc.h"
#include "mca/pml/teg/src/pml_teg_ptl.h"
#include "class/ompi_bitmap.h"
#include "mca/bml/bml.h"
#include "mca/bml/base/base.h"
#include "mca/pml/pml.h"
#include "mca/btl/btl.h"


/* local functions and data */
#define HIER_MAXPROTOCOL 7
static int mca_coll_hierarch_max_protocol=HIER_MAXPROTOCOL;

static char hier_prot[HIER_MAXPROTOCOL][5]={"0","tcp","ib","gm","mx","elan4","sm"};
static char hier_prot[HIER_MAXPROTOCOL][6]={"0","tcp","gm","mx","mvapi","openib","sm"};

static void mca_coll_hierarch_checkfor_component (struct ompi_communicator_t *comm,
                                                  char *component_name, int *key,
@@ -113,7 +117,6 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,

    /* This module only works for intra-communicators at the moment */
    if ( OMPI_COMM_IS_INTER(comm) ) {
        *priority = 0;
        return NULL;
    }
@@ -190,13 +193,11 @@ mca_coll_hierarch_comm_query(struct ompi_communicator_t *comm, int *priority,
const struct mca_coll_base_module_1_0_0_t *
mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
{
    int color, ncount;
    int *colorarr=NULL, *llr=NULL;
    int color;
    int *llr=NULL;
    int size, rank, ret=OMPI_SUCCESS;
    int i, j, c, level;
    int found;

    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    struct mca_coll_base_comm_t *data=NULL;

    rank = ompi_comm_rank(comm);
@@ -207,13 +208,13 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)

    /* Generate the subcommunicator based on the color returned by
       the previous function. */
    ret = ompi_comm_split ( comm, color, rank, &llcomm, 0 );
    ret = ompi_comm_split ( comm, color, rank, &lcomm, 0 );
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }

    data->hier_comm = comm;
    data->hier_llcomm = llcomm;
    data->hier_lcomm = lcomm;
    data->hier_num_reqs = 2 * size;
    data->hier_reqs = (ompi_request_t **) malloc (sizeof(ompi_request_t)*size*2);
    if ( NULL == data->hier_reqs ) {
@@ -238,6 +239,21 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
        data->hier_am_lleader = 1; /*true */
    }

    /* Generate the lleader communicator assuming that all lleaders are the first
       process in the list of processes with the same color. A function generating
       other lleader-comms will follow soon. */
    ompi_comm_split ( comm, data->hier_am_lleader, rank, &llcomm, 0);
    if ( OMPI_SUCCESS != ret ) {
        goto exit;
    }
    data->hier_llcomm = (struct ompi_communicator_t *)malloc (HIER_DEFAULT_NUM_LLCOMM *
                                                              sizeof(struct ompi_communicator_t *));
    if ( NULL == data->hier_llcomm ) {
        goto exit;
    }
    data->hier_num_llcomm = HIER_DEFAULT_NUM_LLCOMM;
    data->hier_llcomm[0] = llcomm;


    /* This is the point where I will introduce later on a function trying to
       compact the colorarr array. Not done at the moment */
@@ -248,7 +264,7 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
        free (llr);
    }
    if ( OMPI_SUCCESS != ret ) {
        ompi_comm_free ( &llcomm );
        ompi_comm_free ( &lcomm );
        if ( NULL != data ) {
            if ( NULL != data->hier_reqs ) {
                free ( data->hier_reqs);
@@ -274,19 +290,16 @@ mca_coll_hierarch_module_init(struct ompi_communicator_t *comm)
*/
int mca_coll_hierarch_module_finalize(struct ompi_communicator_t *comm)
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    struct mca_coll_base_comm_t *data=NULL;

    data = comm->c_coll_selected_data;
    llcomm = data->hier_llcomm;
    lcomm = data->hier_lcomm;

    ompi_comm_free (&llcomm);
    ompi_comm_free (&lcomm);
    free ( data->hier_reqs );
    free ( data->hier_lleaders );
    free ( data->hier_colorarr );
    if ( NULL != data->hier_topo.topo_next ) {
        free (data->hier_topo.topo_next);
    }
    free ( data );

    comm->c_coll_selected_data = NULL;
@@ -316,86 +329,78 @@ mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
                                       int *key,
                                       int *ncount )
{
    mca_pml_proc_t *proc=NULL;
    mca_ptl_proc_t *ptl_proc=NULL;
    mca_ptl_base_module_t *ptl_module=NULL;
    mca_ptl_base_component_t *ptr=NULL;
    ompi_bitmap_t reachable;
    ompi_proc_t **procs=NULL;
    struct mca_bml_base_endpoint_t **bml_endpoints=NULL;
    struct mca_bml_base_btl_array_t *bml_btl_array=NULL;
    mca_bml_base_btl_t *bml_btl=NULL;
    mca_btl_base_component_t *btl=NULL;

    int i, j, size;
    int i, size, rc;

    int counter=0;
    int firstproc=999999;
    int rank = -1;

    int listsize=1;
    int use_next, walk_through_list;
    int use_rdma=0;

    /* default values in case an error occurs */
    *ncount=0;
    *key=MPI_UNDEFINED;

    /* Shall we just check the first element in the ptl list ? */
    if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_walk_through_list_param,
                                                  &walk_through_list)) {
    /* Shall we check the rdma list instead of the send-list in the endpoint-structure? */
    /* if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_rdma_param,
                                                     &use_rdma)) {
        return;
    }

    /* Shall we use the first_elem list or the next_elem list? */
    if (OMPI_SUCCESS != mca_base_param_lookup_int(mca_coll_hierarch_use_next_param,
                                                  &use_next)) {
        return;
    }

    */

    size = ompi_comm_size ( comm );
    rank = ompi_comm_rank ( comm );

    OBJ_CONSTRUCT(&reachable, ompi_bitmap_t);
    rc = ompi_bitmap_init(&reachable, size);
    if(OMPI_SUCCESS != rc) {
        return;
    }

    rc = mca_bml.bml_add_procs (
                                size,
                                procs,
                                bml_endpoints,
                                &reachable
                                );

    if(OMPI_SUCCESS != rc) {
        return;
    }

    for ( i=0; i<size; i++ ) {
        if ( rank == i ) {
            /* skip myself */
            continue;
        }

        proc = mca_pml_teg_proc_lookup_remote (comm, i);
        if ( use_next ) {
            ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_first);
            if ( walk_through_list ) {
                /*
                 * Walking through the list might be unnecessary. The assumption is
                 * that if we did not register this as the first protocol, there is
                 * a protocol which is faster than this one.
                 *
                 * Example: on an IB-cluster with dual processor nodes, I can talk
                 * to all procs with IB, however the process on my node will
                 * hopefully have sm registered as its first protocol.
                 */

                listsize = mca_ptl_array_get_size(&proc->proc_ptl_first);
            }

        if ( use_rdma ) {
            bml_btl_array = &(bml_endpoints[i]->btl_rdma);
        }
        else {
            ptl_proc=mca_ptl_array_get_next(&proc->proc_ptl_next);
            if ( walk_through_list ) {
                listsize = mca_ptl_array_get_size(&proc->proc_ptl_next);
            }
            bml_btl_array = &(bml_endpoints[i]->btl_send);
        }
        bml_btl = mca_bml_base_btl_array_get_index ( bml_btl_array, 0 );
        btl = bml_btl->btl->btl_component;

        for ( j=0; j<listsize;j++) {
            ptl_module = ptl_proc->ptl;
            ptr = ptl_module->ptl_component;
            /* sanity check */
            if ( strcmp(btl->btl_version.mca_type_name,"btl") ) {
                printf("Oops, got the wrong component! type_name = %s\n",
                       btl->btl_version.mca_type_name );
            }

            /* sanity check */
            if ( strcmp(ptr->ptlm_version.mca_type_name,"ptl") ) {
                printf("Oops, got the wrong component! type_name = %s\n",
                       ptr->ptlm_version.mca_type_name );
            }
            /* check for the required component */
            if (! strcmp (btl->btl_version.mca_component_name, component_name)){
                counter++;

            /* check for the required component */
            if (! strcmp (ptr->ptlm_version.mca_component_name, component_name)){
                counter++;

                if (i<firstproc ) {
                    firstproc = i;
                }
                if (i<firstproc ) {
                    firstproc = i;
                }
            }
        }
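The loop above boils down to a simple rule: for each remote peer, look at the component name of the first BTL in its endpoint's send (or rdma) list, count how many peers are reachable through the requested component, and remember the lowest such rank. A self-contained sketch of that counting step, with the BTL lookup abstracted into a plain string array (the array and function names here are illustrative only, not part of the commit):

    #include <stdio.h>
    #include <string.h>

    /* Count the peers whose first BTL matches the requested component name and
       return the lowest matching rank (the later "key"), or -1 if none matches. */
    static int count_reachable_peers(const char **first_btl_of_peer, int size,
                                     int myrank, const char *component_name,
                                     int *ncount)
    {
        int i, counter = 0, firstproc = -1;

        for (i = 0; i < size; i++) {
            if (i == myrank) {
                continue;                     /* skip myself */
            }
            if (0 == strcmp(first_btl_of_peer[i], component_name)) {
                counter++;
                if (-1 == firstproc) {
                    firstproc = i;            /* lowest matching rank so far */
                }
            }
        }
        *ncount = counter;
        return firstproc;
    }

    int main(void)
    {
        /* Invented data: the first send-BTL reported for each of four peers. */
        const char *btls[4] = { "sm", "tcp", "sm", "tcp" };
        int ncount, key;

        key = count_reachable_peers(btls, 4, 0, "tcp", &ncount);
        printf("tcp reaches %d peer(s), first one is rank %d\n", ncount, key);
        return 0;
    }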
@@ -23,7 +23,6 @@
#include "mca/mca.h"
#include "mca/coll/coll.h"
#include "request/request.h"
#include "mca/pml/pml.h"

#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
@@ -40,33 +39,32 @@ extern int mca_coll_hierarch_verbose;
extern int mca_coll_hierarch_walk_through_list_param;
extern int mca_coll_hierarch_use_next_param;


#define HIER_DEFAULT_NUM_LLCOMM 5
/*
 * Data structure for attaching data to the communicator
 */

struct mca_coll_hierarch_topo {
    int topo_root;
    int topo_prev;
    int topo_nextsize;
    int topo_maxsize;
    int *topo_next;
struct mca_coll_hierarch_llead {
    struct ompi_communicator_t *hier_llcomm; /* local leader communicator */
    int hier_num_lleaders; /* number of local leaders */
    int *hier_lleaders; /* list of local leaders, ranks in comm */
    int hier_my_lleader_on_lcomm; /* rank of my lleader in llcomm */
    int hier_am_lleader; /* am I an lleader? */
    int hier_my_lleader; /* pos. of my lleader in hier_lleaders */
};

struct mca_coll_base_comm_t {
    struct ompi_communicator_t *hier_comm; /* link back to the attached comm */
    struct ompi_communicator_t *hier_llcomm; /* low level communicator */
    int hier_level; /* level in the hierarchy. just debugging */
    int hier_num_lleaders; /* number of local leaders */
    int *hier_lleaders; /* list of local leaders, ranks in comm */
    int hier_my_lleader; /* pos. of my lleader in hier_lleaders */
    int hier_my_lleader_on_llcomm; /* rank of my lleader in llcomm */
    int hier_am_lleader; /* am I an lleader? */
    int hier_num_reqs; /* num. of requests */
    ompi_request_t **hier_reqs; /* list of requests */
    int hier_type_colorarr; /* format in which the colorarr is stored */
    int hier_num_colorarr; /* size of the colorarr array */
    int* hier_colorarr; /* array containing the color of all procs */
    struct mca_coll_hierarch_topo hier_topo; /* topology used in the coll ops */
    struct ompi_communicator_t *hier_comm; /* link back to the attached comm */
    struct ompi_communicator_t *hier_lcomm; /* low level communicator */
    struct mca_coll_hierarch_llead *hier_llead; /* structure for lleader communicator */
    int hier_num_llead; /* number of llead structs */
    int hier_level; /* level in the hierarchy. just debugging */
    int hier_num_reqs; /* num. of requests */
    ompi_request_t **hier_reqs; /* list of requests */
    int hier_type_colorarr; /* format in which the colorarr is stored */
    int hier_num_colorarr; /* size of the colorarr array */
    int* hier_colorarr; /* array containing the color of all procs */
};

/* These are various modes how the colorarr is stored. The reason
@@ -163,6 +161,13 @@ static inline void mca_coll_hierarch_get_all_lleaders ( int size, int *carr, int
    return;
}

static inline struct ompi_communicator_t* mca_coll_hierarch_get_llcomm ( int rank,
                                                                         struct mca_coll_base_comm_t *data,
                                                                         int* lleader )
{
    return NULL;
}

static inline void mca_coll_hierarch_get_lleader (int rank, struct mca_coll_base_comm_t *data,
                                                  int* lleader )
{
@@ -35,7 +35,7 @@
int mca_coll_hierarch_barrier_intra(struct ompi_communicator_t *comm)
{
    opal_output_verbose(10, mca_coll_base_output, "In hierarch barrier_intra");
    return comm->c_coll_basic_module->coll_barrier(comm);
    return MPI_SUCCESS;
}
@@ -20,6 +20,7 @@
#include "mpi.h"
#include "ompi/include/constants.h"
#include "opal/util/output.h"
#include "communicator/communicator.h"
#include "mca/coll/coll.h"
#include "mca/coll/base/base.h"
#include "mca/coll/base/coll_tags.h"
@@ -28,28 +29,12 @@
/*
 * bcast_intra
 *
 * Function: - broadcast using O(N) algorithm
 * Function: - broadcast using hierarchical algorithm
 * Accepts:  - same arguments as MPI_Bcast()
 * Returns:  - MPI_SUCCESS or error code
 */


static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
                                                     int count,
                                                     ompi_datatype_t * datatype,
                                                     int root,
                                                     ompi_communicator_t * comm,
                                                     int segsize,
                                                     struct mca_coll_hierarch_topo *topo);

static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
                                                     ompi_datatype_t *datatype,
                                                     int root,
                                                     struct mca_coll_base_comm_t *data,
                                                     int *segsize);
static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data );



int mca_coll_hierarch_bcast_intra(void *buff,
@@ -60,44 +45,25 @@ int mca_coll_hierarch_bcast_intra(void *buff,
{
    struct mca_coll_base_comm_t *data=NULL;
    struct ompi_communicator_t *llcomm=NULL;
    int lleader_of_root, lleader_replaced_by_root=0;
    int rank, ret, lroot;
    int segsize;
    struct ompi_communicator_t *lcomm=NULL;
    int lleader;
    int rank, ret, llroot;

    rank = ompi_comm_rank ( comm );
    data = comm->c_coll_selected_data;
    llcomm = data->hier_llcomm;
    lcomm = data->hier_lcomm;

    /* Determine whether
       a) we have the same local leader as the root of this operation
       b) the root and the local leader are identical

       If a) is true and b) is not, we will replace the local leader for this
       subgroup by the root
    /* This function returns the local leader communicator
       which *always* contains the root of this operation.
       This might involve creating a new communicator. This is
       also the reason that *every* process in comm has to call
       this function
    */

    mca_coll_hierarch_get_lleader (root, data, &lleader_of_root);
    if ( (lleader_of_root == data->hier_my_lleader) && (lleader_of_root != root )) {
        lleader_replaced_by_root = 1;
    }
    llcomm = mca_coll_hierarch_get_llcomm ( root, data, &llroot);

    /* Bcast on the upper level among the local leaders */
    if ( rank == root || ( data->hier_am_lleader && !lleader_replaced_by_root) ) {
        /* this function sets up the topology used in the segmented
           bcast afterwards and determines the segment size. */
        ret = mca_coll_hierarch_intra_bcast_setup_topo (count, datatype, root,
                                                        data, &segsize);
        if ( OMPI_SUCCESS != ret ) {
            return ret;
        }
        /* ok, now do the actual bcast. Hopefully, this routine will come
           out of Jelena's collective module in the end. For the moment,
           I've implemented it myself
        */
        ret = mca_coll_hierarch_intra_segmented_bcast (buff, count,
                                                       datatype, root,
                                                       comm, segsize,
                                                       &(data->hier_topo));
    if ( MPI_UNDEFINED == llroot ) {
        llcomm->c_coll.coll_bcast(buff, count, datatype, llroot, llcomm);
        if ( OMPI_SUCCESS != ret ) {
            return ret;
        }
@@ -106,271 +72,11 @@ int mca_coll_hierarch_bcast_intra(void *buff,
    /* once the local leaders got the data from the root, they can distribute
       it to the processes in their local, low-level communicator.
    */

    if ( MPI_COMM_NULL != llcomm ) {
        if ( lleader_replaced_by_root ) {
            mca_coll_hierarch_map_rank(root, data, &lroot);
            ret = llcomm->c_coll.coll_bcast(buff, count, datatype, lroot,
                                            llcomm);
        }
        else {
            /* Assumption: the rank of the local leader on llcomm is always 0 */
            ret = llcomm->c_coll.coll_bcast(buff, count, datatype, 0, llcomm );
        }
    mca_coll_hierarch_get_lleader (root, data, &lleader);
    ret = lcomm->c_coll.coll_bcast(buff, count, datatype, lleader, lcomm );
    }

    return ret;
}


/*
 * This is the mother of all segmented bcast algorithms of any type.
 * Due to the general structure of the topo argument, you can use this function
 * for any type of algorithm - it just depends on the settings of topo.
 *
 * The implementation is strongly leaning on the implementation in FT-MPI.
 */

static int mca_coll_hierarch_intra_segmented_bcast ( void* buffer,
                                                     int count,
                                                     ompi_datatype_t * datatype,
                                                     int root,
                                                     ompi_communicator_t * comm,
                                                     int segsize,
                                                     struct mca_coll_hierarch_topo *topo)
{
    int err=0, i, j;
    int size, rank;
    int segcount;      /* Number of elements sent with each segment */
    int num_segments;  /* Number of segments */
    int recvcount;     /* the same as segcount, except for the last segment */
    int typelng, realsegsize;
    char *tmpbuf;
    long rlb, ext;
    ompi_request_t ** recv_request= NULL;

    size = ompi_comm_size ( comm );
    rank = ompi_comm_rank ( comm );

    /* ------------------------------------------- */
    /* special case for size == 1 and 2 */
    if (size == 1) {
        return OMPI_SUCCESS;
    }
    if (size == 2) {
        if (rank == root) {
            err = mca_pml.pml_send(buffer, count, datatype, (rank+1)%2,
                                   MCA_COLL_BASE_TAG_BCAST,
                                   MCA_PML_BASE_SEND_STANDARD, comm );
            if ( OMPI_SUCCESS != err ) {
                return err;
            }
        } else {
            err = mca_pml.pml_recv(buffer, count, datatype, root,
                                   MCA_COLL_BASE_TAG_BCAST, comm,
                                   MPI_STATUS_IGNORE);
            if ( OMPI_SUCCESS != err) {
                return err;
            }
        }
        return OMPI_SUCCESS;
    }
    /* end special case for size == 1 and 2 */


    tmpbuf = (char *) buffer;
    /* -------------------------------------------------- */
    /* Determine number of segments and number of elements
       sent per operation */
    err = ompi_ddt_type_size( datatype, &typelng);
    if ( OMPI_SUCCESS != err) {
        return ( err );
    }

    if ( segsize > 0 ) {
        segcount = segsize/typelng;
        num_segments = count/segcount;
        if (0 != (count % segcount)) {
            num_segments++;
        }
    }
    else {
        segcount = count;
        num_segments = 1;
    }

    /* Determine real segment size = segcount * extent */
    err = ompi_ddt_get_extent( datatype, &rlb, &ext );
    if ( OMPI_SUCCESS != err) {
        return ( err );
    }
    realsegsize = segcount*ext;

    /* ----------------------------------------------------- */
    /* Post Irecv if not root-node */
    if (rank != root) {
        /* has a parent. need to receive before sending */
        if ( num_segments > 2 * size ) {
            recv_request = (MPI_Request*)malloc ( sizeof(ompi_request_t *)*num_segments );
        }
        else {
            recv_request = comm->c_coll_selected_data->hier_reqs;
        }

        for( i = 0; i < num_segments; i++) {
            if ( i == (num_segments -1) ) {
                recvcount = count - (segcount * i);
            }
            else {
                recvcount = segcount;
            }
            err = mca_pml.pml_irecv(tmpbuf+i*realsegsize, recvcount, datatype,
                                    topo->topo_prev, MCA_COLL_BASE_TAG_BCAST,
                                    comm, &recv_request[i]);
            if ( OMPI_SUCCESS != err ) {
                return ( err );
            }
        }
    }

    /* ---------------------------------------------- */
    /* If leaf node, just finish the receive */
    if (topo->topo_nextsize == 0) {
        if(recv_request != NULL) {
            err = ompi_request_wait_all (num_segments, recv_request, MPI_STATUSES_IGNORE);
            if ( OMPI_SUCCESS != err ) {
                return ( err );
            }
        }
    }
    else {
        /* ------------------------------------------ */
        /* root or intermediate node */
        for( i = 0; i < num_segments; i++) {
            if (rank != root) {
                /* intermediate nodes have to wait for the completion of
                   the corresponding receive */
                err = ompi_request_wait_all(1, &recv_request[i], MPI_STATUS_IGNORE);
                if ( OMPI_SUCCESS != err ) {
                    return ( err );
                }
            }
            for ( j = 0; j < topo->topo_nextsize; j++) {
                if ( i == ( num_segments - 1 )) {
                    recvcount = count - ( segcount * i);
                }
                else {
                    recvcount = segcount;
                }

                err = mca_pml.pml_send(tmpbuf+i*realsegsize, recvcount,
                                       datatype, topo->topo_next[j],
                                       MCA_COLL_BASE_TAG_BCAST,
                                       MCA_PML_BASE_SEND_STANDARD, comm );
                if( OMPI_SUCCESS != err ) {
                    return ( err );
                }
            } /* for ( j = 0; j < topo_nextsize; j++) */
        } /* for ( i = 0; i < num_segments; i++) */
    }

    if ( num_segments > 2 * size ) {
        if(recv_request != NULL) {
            free(recv_request);
        }
    }

    return OMPI_SUCCESS;
}


/*
 * This routine does the magic to determine which topology (bmtree, linear, chain etc.)
 * would perform best in this scenario. At the moment, we just do bmtree.
 *
 * The implementation is once again strongly related to the version in FT-MPI.
 */
static int mca_coll_hierarch_intra_bcast_setup_topo (int count,
                                                     ompi_datatype_t *datatype,
                                                     int root,
                                                     struct mca_coll_base_comm_t *data,
                                                     int *segsize)
{
    /* without spending time on that issue, I set for the moment segsize to 32k. */
    *segsize = 32768;

    /* without spending time on that issue, I set the topology to a binomial tree */
    setup_topo_bmtree ( root, data );

    return OMPI_SUCCESS;
}


static void setup_topo_bmtree ( int root, struct mca_coll_base_comm_t *data )
{
    /* This implementation is based on the closest-first bmtree algorithms
       in FT-MPI implemented by George/Jelena; it has however a couple of
       significant modifications:
       - we are not having a contiguous list of participating processes,
         but a list containing the ranks of the participating processes.
    */

    int childs = 0;
    int rank, size, mask=1;
    int index, remote, found;
    int rootpos;
    struct mca_coll_hierarch_topo *topo=&(data->hier_topo);



    if (found) {
        size = data->hier_num_lleaders;
    }
    else {
        size = data->hier_num_lleaders + 1;
        data->hier_lleaders[rootpos] = root;
    }
    rank = data->hier_my_lleader;

    /* allocate the array of child processes, if not yet done */
    if ( NULL == topo->topo_next && 0 == topo->topo_maxsize ) {
        topo->topo_next = (int *) malloc (data->hier_num_lleaders+1 * sizeof(int));
        if ( NULL != topo->topo_next ) {
            return;
        }
        topo->topo_maxsize=data->hier_num_lleaders+1;
    }

    index = rank - rootpos;

    if( index < 0 ) index += size;
    while( mask <= index ) mask <<= 1;

    /* Determine the rank of my father */
    if( rootpos == rank ) {
        topo->topo_prev = root;
    }
    else {
        remote = (index ^ (mask >> 1)) + rootpos;
        if( remote >= size ) {
            remote -= size;
        }
        topo->topo_prev = data->hier_lleaders[remote];
    }

    /* And now let's fill in my children */
    while( mask < size ) {
        remote = (index ^ mask);
        if( remote >= size ) break;
        remote += rootpos;
        if( remote >= size ) remote -= size;
        topo->topo_next[childs] = data->hier_lleaders[remote];
        mask <<= 1;
        childs++;
    }

    topo->topo_nextsize = childs;
    return;
}

#endif
@@ -22,11 +22,9 @@

#include "ompi_config.h"
#include "coll_hierarch.h"
#include "coll-hierarch-version.h"

#include "mpi.h"
#include "mca/coll/coll.h"
#include "coll_hierarch.h"

/*
 * Public string showing the coll ompi_hierarch component version number
@@ -40,8 +38,7 @@ const char *mca_coll_hierarch_component_version_string =
int mca_coll_hierarch_priority_param = -1;
int mca_coll_hierarch_verbose_param = -1;
int mca_coll_hierarch_verbose = 0;
int mca_coll_hierarch_walk_through_list_param=-1;
int mca_coll_hierarch_use_next_param=-1;
int mca_coll_hierarch_use_rdma_param=-1;


/*
@@ -82,12 +79,10 @@ const mca_coll_base_component_1_0_0_t mca_coll_hierarch_component = {

    {
        /* Whether the component is checkpointable or not */

        true
    },

    /* Initialization / querying functions */

    mca_coll_hierarch_init_query,
    mca_coll_hierarch_comm_query,
    mca_coll_hierarch_comm_unquery
@@ -103,10 +98,8 @@ static int hierarch_open(void)
    mca_coll_hierarch_verbose_param =
        mca_base_param_register_int("coll", "hierarch", "verbose", NULL,
                                    mca_coll_hierarch_verbose);
    mca_coll_hierarch_walk_through_list_param =
        mca_base_param_register_int("coll", "hierarch", "walk_through_list", NULL, 0);
    mca_coll_hierarch_use_next_param =
        mca_base_param_register_int("coll", "hierarch", "use_next", NULL, 0);
    mca_base_param_register_int("coll", "hierarch", "use_rdma", NULL, 0);

    return OMPI_SUCCESS;
}
@@ -27,7 +27,6 @@
#include "coll_hierarch.h"


#ifdef SIMPLE_HIERARCH
/*
 * reduce_intra
 *
@@ -40,102 +39,5 @@ int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
                                   struct ompi_op_t *op,
                                   int root, struct ompi_communicator_t *comm)
{
    struct mca_coll_base_comm_t *data=NULL;
    struct ompi_communicator_t *llcomm=NULL;
    long true_lb, true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    int i, rank, ret;

    rank = ompi_comm_rank ( comm );
    data = comm->c_coll_selected_data;
    llcomm = data->hier_llcomm;


    /*
     * collect the data from the low-level communicators. Result will be stored
     * on the local leaders.
     */
    if ( MPI_COMM_NULL != llcomm ) {
        ret = llcomm->c_coll.coll_reduce(sbuf, rbuf, count, dtype, op,
                                         data->hier_my_lleader, llcomm );
    }


    /* trivial linear reduction receiving the data from all local leaders.
       need something significantly better */
    if ( rank == root ) {
        /* Root receives and reduces messages */
        ompi_ddt_get_extent(dtype, &lb, &extent);
        ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);

        free_buffer = malloc(true_extent + (count - 1) * extent);
        if (NULL == free_buffer) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        pml_buffer = free_buffer - lb;

        if ( !data->hier_am_lleader ) {
            /* Initialize the receive buffer. */
            ret = mca_pml.pml_recv(rbuf, count, dtype, data->hier_lleader[0],
                                   MCA_COLL_BASE_TAG_REDUCE, comm,
                                   MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != ret) {
                goto exit;
            }
        }

        /* Loop receiving and calling reduction function (C or Fortran). */
        for (i = 1; i < data->hier_num_lleaders; i++) {
            if ( data->hier_lleader[i] == rank ) {
                continue;
            }
            ret = mca_pml.pml_recv(pml_buffer, count, dtype, data->hier_lleader[i],
                                   MCA_COLL_BASE_TAG_REDUCE, comm,
                                   MPI_STATUS_IGNORE);
            if (MPI_SUCCESS != ret) {
                goto exit;
            }

            /* Perform the reduction */
            ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);
        }
    }
    else if ( data->hier_am_lleader ) {
        if ( MPI_COMM_NULL != llcomm ) {
            ret = mca_pml.pml_send ( rbuf, count, dtype, root,
                                     MCA_COLL_BASE_TAG_REDUCE,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm);
        }
        else {
            ret = mca_pml.pml_send ( sbuf, count, dtype, root,
                                     MCA_COLL_BASE_TAG_REDUCE,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm);
        }
    }

 exit:
    if ( NULL != free_buffer ) {
        free ( free_buffer);
    }


    return ret;
    return OMPI_SUCCESS;
}

#else

int mca_coll_hierarch_reduce_intra(void *sbuf, void *rbuf, int count,
                                   struct ompi_datatype_t *dtype,
                                   struct ompi_op_t *op,
                                   int root, struct ompi_communicator_t *comm)
{
    opal_output_verbose(10, mca_coll_base_output, "In hierarch reduce_intra");
    return comm->c_coll_basic_module->coll_reduce(sbuf, rbuf, count, dtype,
                                                  op, root, comm);
}

#endif