1
1

adding four different algorithms for a hierarchical bcast which try to

generate an overlap between the different layers. Why four versions? Because
there is currently always a trade-off between using non-blocking operations
on a layer with a trivial, linear algorithm and using the more sophisticated
algorithms in a blocking manner.

- bcast_intra_seg: uses the bcast of lcomm and llcomm, similarly
  to the original algorithm in hierarch. However, it can segment
  the message, such that we might get an overlap between the two
  layers. This overlap is based on the assumption that a process
  might be done early with a bcast and can start the next one.
- bcast_intra_seg1: replaces the llcomm->bcast by isend/irecvs
  to increase the overlap, but keeps the lcomm->bcast
- bcast_intra_seg2: replaces the lcomm->bcast by isend/irecvs
  to increase the overlap, but keeps the llcomm->bcast
- bcast_intra_seg3: replaces both lcomm->bcast and llcomm->bcast
  by isend/irecvs

The code is only lightly tested; more testing will follow.

This commit was SVN r19358.
Этот коммит содержится в:
Edgar Gabriel 2008-08-18 16:05:44 +00:00
родитель 19514f4df6
Коммит 7cbc4a4077
3 изменённых файлов: 698 добавлений и 23 удалений

Просмотреть файл

@ -45,6 +45,17 @@ extern int mca_coll_hierarch_verbose_param;
extern int mca_coll_hierarch_use_rdma_param;
extern int mca_coll_hierarch_ignore_sm_param;
extern int mca_coll_hierarch_detection_alg_param;
/* Selects which hierarchical bcast variant is executed; compared against
 * the COLL_HIERARCH_*_BCAST_ALG values below by the bcast dispatcher. */
extern int mca_coll_hierarch_bcast_alg_param;
/* Segment size handed to the segmented bcast variants. */
extern int mca_coll_hierarch_segsize_param;
/* Blocking bcast on both layers (llcomm and lcomm), message split into segments. */
#define COLL_HIERARCH_SEG_BCAST_ALG 1
/* llcomm bcast replaced by isend/irecv, blocking bcast kept on lcomm. */
#define COLL_HIERARCH_SEG1_BCAST_ALG 2
/* lcomm bcast replaced by isend/irecv, blocking bcast kept on llcomm. */
#define COLL_HIERARCH_SEG2_BCAST_ALG 3
/* Both llcomm and lcomm bcasts replaced by isend/irecv. */
#define COLL_HIERARCH_SEG3_BCAST_ALG 4
/* Any value other than 1-4 (including this one) makes the dispatcher run the
 * blocking variant with a segment size of 0, i.e. one single segment. */
#define COLL_HIERARCH_BASIC_BCAST_ALG 5
#define HIER_DEFAULT_NUM_LLEAD 5
/*

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 University of Houston. All rights reserved.
* Copyright (c) 2007-2008 University of Houston. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -25,6 +25,9 @@
#include "orte/util/show_help.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/datatype/datatype.h"
#include "ompi/mca/pml/pml.h"
/*
* bcast_intra
@ -35,53 +38,699 @@
*/
/*
 * Forward declarations of the file-local bcast variants. All of them take
 * the arguments of mca_coll_hierarch_bcast_intra plus a trailing segsize,
 * and are selected by mca_coll_hierarch_bcast_intra based on
 * mca_coll_hierarch_bcast_alg_param.
 */
static int mca_coll_hierarch_bcast_intra_seg (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
static int mca_coll_hierarch_bcast_intra_seg1 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
static int mca_coll_hierarch_bcast_intra_seg2 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
static int mca_coll_hierarch_bcast_intra_seg3 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
/*
 * Entry point of the hierarchical bcast: picks one of the file-local
 * variants according to mca_coll_hierarch_bcast_alg_param and forwards
 * all arguments to it.
 *
 *  - COLL_HIERARCH_SEG_BCAST_ALG:  blocking bcast on llcomm and lcomm, with
 *    the message cut into segments so the two layers may overlap.
 *  - COLL_HIERARCH_SEG1_BCAST_ALG: llcomm bcast replaced by isend/irecv,
 *    lcomm bcast kept.
 *  - COLL_HIERARCH_SEG2_BCAST_ALG: lcomm bcast replaced by isend/irecv,
 *    llcomm bcast kept.
 *  - COLL_HIERARCH_SEG3_BCAST_ALG: both bcasts replaced by isend/irecv.
 *  - anything else: the blocking variant with the whole message as a
 *    single segment.
 *
 * Returns whatever the selected variant returns.
 */
int mca_coll_hierarch_bcast_intra(void *buff,
                                  int count,
                                  struct ompi_datatype_t *datatype,
                                  int root,
                                  struct ompi_communicator_t *comm,
                                  mca_coll_base_module_t *module)
{
    const int seg_bytes = mca_coll_hierarch_segsize_param;

    switch ( mca_coll_hierarch_bcast_alg_param ) {
    case COLL_HIERARCH_SEG_BCAST_ALG:
        return mca_coll_hierarch_bcast_intra_seg ( buff, count, datatype, root,
                                                   comm, module, seg_bytes );

    case COLL_HIERARCH_SEG1_BCAST_ALG:
        return mca_coll_hierarch_bcast_intra_seg1 ( buff, count, datatype, root,
                                                    comm, module, seg_bytes );

    case COLL_HIERARCH_SEG2_BCAST_ALG:
        return mca_coll_hierarch_bcast_intra_seg2 ( buff, count, datatype, root,
                                                    comm, module, seg_bytes );

    case COLL_HIERARCH_SEG3_BCAST_ALG:
        return mca_coll_hierarch_bcast_intra_seg3 ( buff, count, datatype, root,
                                                    comm, module, seg_bytes );

    default:
        break;
    }

    /* A segment size of zero forces the entire message to be bcasted
       as a single segment. */
    return mca_coll_hierarch_bcast_intra_seg ( buff, count, datatype, root,
                                               comm, module, 0 );
}
/*
 * Segmented hierarchical bcast that uses the blocking bcast of both layers.
 *
 * The message is cut into segments derived from segsize; for every segment
 * a bcast is run on llcomm (the local leaders) followed by a bcast on lcomm
 * (the low-level communicator stored in the module). A segsize of 0 sends
 * the whole message as one segment.
 *
 * buff/count/datatype/root/comm: the usual bcast arguments.
 * module:  the hierarch module, provides hier_lcomm and the llcomm lookup.
 * segsize: requested segment size; rounded down to a multiple of the
 *          datatype size.
 *
 * Returns OMPI_SUCCESS, or the first error reported by one of the bcasts.
 */
static int mca_coll_hierarch_bcast_intra_seg (void *buff,
                                              int count,
                                              struct ompi_datatype_t *datatype,
                                              int root,
                                              struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module,
                                              int segsize )
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
    int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
    int rank=0, ret=OMPI_SUCCESS;
    MPI_Aint ub=0, typeext=0;
    size_t typesize=0;
    int realsegsize=0, remaining_count=0;
    int num_segments=0, segcount=0, segindex=0;
    char* tmpbuf = (char *) buff;

    rank  = ompi_comm_rank ( comm );
    lcomm = hierarch_module->hier_lcomm;

    if ( mca_coll_hierarch_verbose_param ) {
        printf("%s:%d: executing hierarchical seg bcast with cnt=%d root=%d, segsize=%d\n",
               comm->c_name, rank, count, root, segsize );
    }

    /*
     * This function returns the local leader communicator
     * which *always* contains the root of this operation.
     * This might involve creating a new communicator. This is
     * also the reason, that *every* process in comm has to call
     * this function
     */
    llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);

    ompi_ddt_type_size ( datatype, &typesize);
    ompi_ddt_get_extent ( datatype, &typeext, &ub);

    /* Determine number of segments and number of elements per segment */
    if ((typesize > 0) && (segsize % typesize != 0)) {
        /* segment size must be a multiple of typesize */
        segsize = typesize * (segsize / typesize);
    }
    if ((segsize == 0) || (count == 0) || (typesize == 0)) {
        segcount = count;
        num_segments = 1;
    }
    else {
        segcount = segsize/typesize;
        num_segments = count/segcount;
        if ( (count % segcount) != 0 ) {
            num_segments++;
        }
        if (num_segments == 1) {
            segcount = count;
        }
    }

    /* distance between the start of two consecutive segments in buff */
    realsegsize = segcount*typeext;
    remaining_count = segcount;

    for (segindex = 0; segindex < num_segments; segindex++) {
        /* determine how many elements are being sent in this round;
           only the last segment can be shorter than segcount */
        if( segindex == (num_segments - 1) ) {
            remaining_count = count - segindex*segcount;
        }

        /* Bcast on the upper level among the local leaders */
        if ( MPI_UNDEFINED != llroot ) {
            ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count,
                                            datatype, llroot, llcomm,
                                            llcomm->c_coll.coll_bcast_module);
            if ( OMPI_SUCCESS != ret ) {
                return ret;
            }
        }

        /* once the local leaders got the data from the root, they can distribute
         * it to the processes in their local, low-level communicator.
         */
        if ( MPI_COMM_NULL != lcomm ) {
            ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count,
                                           datatype, lroot, lcomm,
                                           lcomm->c_coll.coll_bcast_module);
            if ( OMPI_SUCCESS != ret ) {
                return ret;
            }
        }

        tmpbuf += realsegsize;
    }

    return ret;
}
/*
 * Segmented hierarchical bcast: non-blocking point-to-point on llcomm,
 * blocking bcast on lcomm.
 *
 * The first segment is bcasted on llcomm with the blocking bcast. For every
 * further segment the llcomm root posts isends to all other llcomm members
 * (they post one irecv), and while those are in flight the previous segment
 * is bcasted on lcomm. After the loop the last segment is bcasted on lcomm.
 *
 * buff/count/datatype/root/comm: the usual bcast arguments.
 * module:  the hierarch module, provides hier_lcomm and the llcomm lookup.
 * segsize: requested segment size; rounded down to a multiple of the
 *          datatype size, 0 means a single segment.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the request array cannot
 * be allocated, or the first error reported by a communication call.
 */
static int mca_coll_hierarch_bcast_intra_seg1 (void *buff,
                                               int count,
                                               struct ompi_datatype_t *datatype,
                                               int root,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module,
                                               int segsize )
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
    int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
    int llrank=0, llsize=0, rank=0, ret=OMPI_SUCCESS;
    MPI_Aint ub=0, typeext=0;
    size_t typesize=0;
    int i, realsegsize=0, remaining_count=0;
    int num_segments=0, segcount=0, segindex=0;
    char* tmpbuf = (char *) buff;
    ompi_request_t **req=NULL;

    rank  = ompi_comm_rank ( comm );
    lcomm = hierarch_module->hier_lcomm;

    if ( mca_coll_hierarch_verbose_param ) {
        printf("%s:%d: executing hierarchical seg1 bcast with cnt=%d root=%d segsize=%d\n",
               comm->c_name, rank, count, root, segsize );
    }

    /*
     * This function returns the local leader communicator
     * which *always* contains the root of this operation.
     * This might involve creating a new communicator. This is
     * also the reason, that *every* process in comm has to call
     * this function
     */
    llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);

    ompi_ddt_type_size ( datatype, &typesize);
    ompi_ddt_get_extent ( datatype, &typeext, &ub);

    /* Determine number of segments and number of elements per segment */
    if ((typesize > 0) && (segsize % typesize != 0)) {
        /* segment size must be a multiple of typesize */
        segsize = typesize * (segsize / typesize);
    }
    if ((segsize == 0) || (count == 0) || (typesize == 0)) {
        segcount = count;
        num_segments = 1;
    }
    else {
        segcount = segsize/typesize;
        num_segments = count/segcount;
        if ( (count % segcount) != 0 ) {
            num_segments++;
        }
        if (num_segments == 1) {
            segcount = count;
        }
    }

    /* distance between the start of two consecutive segments in buff */
    realsegsize = segcount*typeext;
    remaining_count = segcount;

    /* one request slot per llcomm member; only needed by llcomm members */
    if ( MPI_COMM_NULL != llcomm ) {
        llrank = ompi_comm_rank ( llcomm );
        llsize = ompi_comm_size ( llcomm );
        req = (ompi_request_t **)malloc ( llsize * sizeof(ompi_request_t *));
        if ( NULL == req ) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for(i=0; i<llsize; i++) {
            req[i] = MPI_REQUEST_NULL;
        }
    }

    /* Broadcasting the first segment in the upper level*/
    if ( MPI_UNDEFINED != llroot ) {
        ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
                                        llroot, llcomm,
                                        llcomm->c_coll.coll_bcast_module );
        if ( OMPI_SUCCESS != ret ) {
            goto exit;
        }
    }

    /* Since the first segment has already been bcasted, this loop
       starts at 1 and not with segment 0 */
    for (segindex = 1; segindex < num_segments; segindex++) {
        /* determine how many elements are being sent in this round */
        if( segindex == (num_segments - 1) ) {
            remaining_count = count - segindex*segcount;
        }
        tmpbuf += realsegsize;

        /* Broadcasting the next segment in the upper level using non blocking
           operations*/
        if ( MPI_COMM_NULL != llcomm ) {
            if(llrank == llroot) {
                for(i = 0; i < llsize; i++) {
                    if( i != llroot) {
                        ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i,
                                                 MCA_COLL_BASE_TAG_BCAST,
                                                 MCA_PML_BASE_SEND_STANDARD,
                                                 llcomm, &(req[i])));
                        if ( OMPI_SUCCESS != ret ) {
                            goto exit;
                        }
                    }
                }
            }
            else {
                /* A non-root posts exactly one receive, so slot 0 is always
                   free and always inside the llsize-element array. */
                ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, llroot,
                                         MCA_COLL_BASE_TAG_BCAST,
                                         llcomm, &(req[0])));
                if ( OMPI_SUCCESS != ret ) {
                    goto exit;
                }
            }
        }

        /* broadcasting the previous segment among the lower level processes
           using blocking operations; every segment but the last one holds
           exactly segcount elements */
        if ( MPI_COMM_NULL != lcomm ) {
            ret = lcomm->c_coll.coll_bcast(tmpbuf-realsegsize, segcount,
                                           datatype, lroot, lcomm,
                                           lcomm->c_coll.coll_bcast_module);
            if ( OMPI_SUCCESS != ret ) {
                goto exit;
            }
        }

        /* complete the upper level transfer of the current segment */
        if ( NULL != req ) {
            ret = ompi_request_wait_all(llsize, req, MPI_STATUSES_IGNORE);
            if ( OMPI_SUCCESS != ret ) {
                goto exit;
            }
        }
    }

    /* Bcasting the last segment among the lower level processes using blocking operations
     * once the local leaders got the data from the root, they can distribute
     * it to the processes in their local, low-level communicator.
     */
    if ( MPI_COMM_NULL != lcomm ) {
        ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
                                       lroot, lcomm,
                                       lcomm->c_coll.coll_bcast_module);
    }

 exit:
    free ( req );

    return ret;
}
/*
 * Segmented hierarchical bcast: blocking bcast on llcomm, non-blocking
 * point-to-point on lcomm.
 *
 * The first segment is bcasted on llcomm. In every loop iteration the lcomm
 * root posts isends of the segment it already holds to all other lcomm
 * members (they post one irecv), then the next segment is bcasted on llcomm,
 * then the point-to-point requests are completed. After the loop the last
 * segment is bcasted on lcomm with the blocking bcast.
 *
 * buff/count/datatype/root/comm: the usual bcast arguments.
 * module:  the hierarch module, provides hier_lcomm and the llcomm lookup.
 * segsize: requested segment size; rounded down to a multiple of the
 *          datatype size, 0 means a single segment.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the request array cannot
 * be allocated, or the first error reported by a communication call.
 */
static int mca_coll_hierarch_bcast_intra_seg2 (void *buff,
                                               int count,
                                               struct ompi_datatype_t *datatype,
                                               int root,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module,
                                               int segsize )
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
    int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
    int rank=0, ret=OMPI_SUCCESS;
    int lsize=0, lrank=0;
    MPI_Aint ub=0, typeext=0;
    size_t typesize=0;
    int i, realsegsize=0, remaining_count=0;
    int num_segments=0, segcount=0, segindex=0;
    char* tmpbuf = (char *) buff;
    ompi_request_t **req=NULL;

    rank  = ompi_comm_rank ( comm );
    lcomm = hierarch_module->hier_lcomm;

    if ( mca_coll_hierarch_verbose_param ) {
        printf("%s:%d: executing hierarchical seg2 bcast with cnt=%d root=%d segsize=%d\n",
               comm->c_name, rank, count, root, segsize );
    }

    /*
     * This function returns the local leader communicator
     * which *always* contains the root of this operation.
     * This might involve creating a new communicator. This is
     * also the reason, that *every* process in comm has to call
     * this function
     */
    llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);

    ompi_ddt_type_size ( datatype, &typesize);
    ompi_ddt_get_extent ( datatype, &typeext, &ub);

    /* Determine number of segments and number of elements per segment */
    if ((typesize > 0) && (segsize % typesize != 0)) {
        /* segment size must be a multiple of typesize */
        segsize = typesize * (segsize / typesize);
    }
    if ((segsize == 0) || (count == 0) || (typesize == 0)) {
        segcount = count;
        num_segments = 1;
    }
    else {
        segcount = segsize/typesize;
        num_segments = count/segcount;
        if ( (count % segcount) != 0 ) {
            num_segments++;
        }
        if (num_segments == 1) {
            segcount = count;
        }
    }

    /* distance between the start of two consecutive segments in buff */
    realsegsize = segcount*typeext;
    remaining_count = segcount;

    /* one request slot per lcomm member; lcomm is only queried once it is
       known not to be MPI_COMM_NULL, like every other use in this function */
    if ( MPI_COMM_NULL != lcomm ) {
        lrank = ompi_comm_rank ( lcomm );
        lsize = ompi_comm_size ( lcomm );
        req = (ompi_request_t **)malloc ( lsize * sizeof(ompi_request_t *));
        if ( NULL == req ) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for(i=0; i<lsize; i++) {
            req[i] = MPI_REQUEST_NULL;
        }
    }

    /* Broadcasting the first segment in the upper level */
    if ( MPI_UNDEFINED != llroot ) {
        ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
                                        llroot, llcomm,
                                        llcomm->c_coll.coll_bcast_module);
        if ( OMPI_SUCCESS != ret ) {
            goto exit;
        }
    }

    for (segindex = 1; segindex < num_segments; segindex++) {
        /* once the local leaders got the data from the root, they can distribute
         * it to the processes in their local, low-level communicator.
         * tmpbuf/remaining_count still describe the previous segment here. */
        if ( MPI_COMM_NULL != lcomm ) {
            if(lrank == lroot) {
                for(i = 0; i < lsize; i++) {
                    if( i != lroot) {
                        ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i,
                                                 MCA_COLL_BASE_TAG_BCAST,
                                                 MCA_PML_BASE_SEND_STANDARD,
                                                 lcomm, &(req[i])));
                        if ( OMPI_SUCCESS != ret ) {
                            goto exit;
                        }
                    }
                }
            }
            else {
                /* A non-root posts exactly one receive, so slot 0 is always
                   free and always inside the lsize-element array. */
                ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, lroot,
                                         MCA_COLL_BASE_TAG_BCAST, lcomm, &(req[0])));
                if ( OMPI_SUCCESS != ret ) {
                    goto exit;
                }
            }
        }

        /* determine how many elements are being sent in this round */
        if( segindex == (num_segments - 1) ) {
            remaining_count = count - segindex*segcount;
        }
        tmpbuf += realsegsize;

        /* Broadcasting the next segment in the upper level */
        if ( MPI_UNDEFINED != llroot ) {
            ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
                                            llroot, llcomm,
                                            llcomm->c_coll.coll_bcast_module);
            if ( OMPI_SUCCESS != ret ) {
                goto exit;
            }
        }

        /* complete the lower level transfer of the previous segment */
        if ( NULL != req ) {
            ret = ompi_request_wait_all ( lsize, req, MPI_STATUSES_IGNORE);
            if ( OMPI_SUCCESS != ret ) {
                goto exit;
            }
        }
    }

    /* Bcasting the last segment among the lower level processes
     * once the local leaders got the data from the root, they can distribute
     * it to the processes in their local, low-level communicator.
     */
    if ( MPI_COMM_NULL != lcomm ) {
        ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
                                       lroot, lcomm,
                                       lcomm->c_coll.coll_bcast_module);
    }

 exit:
    free ( req );

    return ret;
}
/*
 * Segmented hierarchical bcast: non-blocking point-to-point on both llcomm
 * and lcomm.
 *
 * The first segment is bcasted on llcomm with the blocking bcast. For every
 * further segment the llcomm root isends it to the other llcomm members
 * while the lcomm root isends the previous segment to the other lcomm
 * members; both sets of requests are completed before the next iteration.
 * After the loop the last segment is bcasted on lcomm with the blocking
 * bcast.
 *
 * buff/count/datatype/root/comm: the usual bcast arguments.
 * module:  the hierarch module, provides hier_lcomm and the llcomm lookup.
 * segsize: requested segment size; rounded down to a multiple of the
 *          datatype size, 0 means a single segment.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if a request array cannot
 * be allocated, or the first error reported by a communication call.
 */
static int mca_coll_hierarch_bcast_intra_seg3 (void *buff,
                                               int count,
                                               struct ompi_datatype_t *datatype,
                                               int root,
                                               struct ompi_communicator_t *comm,
                                               mca_coll_base_module_t *module,
                                               int segsize )
{
    struct ompi_communicator_t *llcomm=NULL;
    struct ompi_communicator_t *lcomm=NULL;
    mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
    int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
    int llrank=0, llsize=0, rank=0, ret=OMPI_SUCCESS;
    int lsize=0, lrank=0;
    MPI_Aint ub=0, typeext=0;
    size_t typesize=0;
    int i, realsegsize=0, remaining_count=0;
    int num_segments=0, segcount=0, segindex=0;
    char* tmpbuf = (char *) buff;
    ompi_request_t **req=NULL, **req1=NULL;

    rank  = ompi_comm_rank ( comm );
    lcomm = hierarch_module->hier_lcomm;

    if ( mca_coll_hierarch_verbose_param ) {
        printf("%s:%d: executing hierarchical seg3 bcast with cnt=%d root=%d segsize=%d\n",
               comm->c_name, rank, count, root, segsize );
    }

    /*
     * This function returns the local leader communicator
     * which *always* contains the root of this operation.
     * This might involve creating a new communicator. This is
     * also the reason, that *every* process in comm has to call
     * this function
     */
    llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);

    ompi_ddt_type_size ( datatype, &typesize);
    ompi_ddt_get_extent ( datatype, &typeext, &ub);

    /* Determine number of segments and number of elements per segment */
    if ((typesize > 0) && (segsize % typesize != 0)) {
        /* segment size must be a multiple of typesize */
        segsize = typesize * (segsize / typesize);
    }
    if ((segsize == 0) || (count == 0) || (typesize == 0)) {
        segcount = count;
        num_segments = 1;
    }
    else {
        segcount = segsize/typesize;
        num_segments = count/segcount;
        if ( (count % segcount) != 0 ) {
            num_segments++;
        }
        if (num_segments == 1) {
            segcount = count;
        }
    }

    /* distance between the start of two consecutive segments in buff */
    realsegsize = segcount*typeext;
    remaining_count = segcount;

    /* req1: one request slot per lcomm member (lower level) */
    if ( MPI_COMM_NULL != lcomm ) {
        lrank = ompi_comm_rank ( lcomm );
        lsize = ompi_comm_size ( lcomm );
        req1 = (ompi_request_t **)malloc ( lsize * sizeof(ompi_request_t *));
        if ( NULL == req1 ) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for(i=0; i<lsize; i++) {
            req1[i] = MPI_REQUEST_NULL;
        }
    }

    /* req: one request slot per llcomm member (upper level) */
    if ( MPI_COMM_NULL != llcomm ) {
        llsize = ompi_comm_size ( llcomm );
        llrank = ompi_comm_rank ( llcomm );
        req = (ompi_request_t **)malloc ( llsize * sizeof(ompi_request_t *));
        if ( NULL == req ) {
            /* leave through exit so that req1 is released as well */
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        for(i=0; i<llsize; i++) {
            req[i] = MPI_REQUEST_NULL;
        }
    }

    /* Broadcasting the first segment in the upper level*/
    if ( MPI_UNDEFINED != llroot ) {
        ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
                                        llroot, llcomm,
                                        llcomm->c_coll.coll_bcast_module);
        if ( OMPI_SUCCESS != ret ) {
            goto exit;
        }
    }

    /* Since the first segment has already been bcasted, this loop
       starts at 1 and not with segment 0 */
    for (segindex = 1; segindex < num_segments; segindex++) {
        /* determine how many elements are being sent in this round */
        if( segindex == (num_segments - 1) ) {
            remaining_count = count - segindex*segcount;
        }
        tmpbuf += realsegsize;

        /* Broadcasting the next segment in the upper level*/
        if ( MPI_COMM_NULL != llcomm ) {
            if(llrank == llroot) {
                for(i = 0; i < llsize; i++) {
                    if( i != llroot) {
                        ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i,
                                                 MCA_COLL_BASE_TAG_BCAST,
                                                 MCA_PML_BASE_SEND_STANDARD,
                                                 llcomm, (req+i) ));
                        if ( OMPI_SUCCESS != ret ) {
                            goto exit;
                        }
                    }
                }
            }
            else {
                /* a non-root posts exactly one receive; slot 0 is used */
                ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, llroot,
                                         MCA_COLL_BASE_TAG_BCAST,
                                         llcomm, (req) ));
                if ( OMPI_SUCCESS != ret ) {
                    goto exit;
                }
            }
        }

        /* broadcasting the previous segment among the lower level processes
         * once the local leaders got the data from the root, they can distribute
         * it to the processes in their local, low-level communicator.
         * Every segment but the last one holds exactly segcount elements.
         */
        if ( MPI_COMM_NULL != lcomm ) {
            if(lrank == lroot) {
                for(i = 0; i < lsize; i++) {
                    if( i != lroot) {
                        ret = MCA_PML_CALL(isend(tmpbuf-realsegsize, segcount, datatype, i,
                                                 MCA_COLL_BASE_TAG_BCAST,
                                                 MCA_PML_BASE_SEND_STANDARD,
                                                 lcomm, (req1+i) ));
                        if ( OMPI_SUCCESS != ret ) {
                            goto exit;
                        }
                    }
                }
            }
            else {
                /* a non-root posts exactly one receive; slot 0 is used */
                ret = MCA_PML_CALL(irecv(tmpbuf-realsegsize, segcount, datatype, lroot,
                                         MCA_COLL_BASE_TAG_BCAST , lcomm, req1 ));
                if ( OMPI_SUCCESS != ret ) {
                    goto exit;
                }
            }
        }

        /* Wait for the upper level bcast to complete*/
        if ( NULL != req ) {
            ret = ompi_request_wait_all(llsize, req, MPI_STATUSES_IGNORE);
            if ( OMPI_SUCCESS != ret ) {
                goto exit;
            }
        }

        /*Wait for the lower level bcast to complete */
        if ( NULL != req1 ) {
            ret = ompi_request_wait_all(lsize, req1, MPI_STATUSES_IGNORE);
            if ( OMPI_SUCCESS != ret ) {
                goto exit;
            }
        }
    }

    /*Bcasting the last segment among the lower level processes
     * once the local leaders got the data from the root, they can distribute
     * it to the processes in their local, low-level communicator.
     */
    if ( MPI_COMM_NULL != lcomm ) {
        ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
                                       lroot, lcomm,
                                       lcomm->c_coll.coll_bcast_module);
    }

 exit:
    free ( req );
    free ( req1 );

    return ret;
}

Просмотреть файл

@ -46,6 +46,8 @@ int mca_coll_hierarch_verbose_param=0;
int mca_coll_hierarch_use_rdma_param=0;
int mca_coll_hierarch_ignore_sm_param=0;
/* Registered as "detection_alg": algorithm used for detecting the hierarchy. */
int mca_coll_hierarch_detection_alg_param=2;
/* Registered as "bcast_alg": selects the bcast variant; defaults to the
 * basic algorithm. */
int mca_coll_hierarch_bcast_alg_param=COLL_HIERARCH_BASIC_BCAST_ALG;
/* Registered as "segment_size": segment size for the segmented algorithms.
 * NOTE(review): presumably in bytes, since it is divided by the datatype
 * size to obtain an element count - confirm. */
int mca_coll_hierarch_segsize_param=32768;
/*
* Local function
@ -122,10 +124,23 @@ static int hierarch_open(void)
mca_base_param_reg_int(&mca_coll_hierarch_component.collm_version,
"detection_alg",
"Used to specify the algorithm for detecting Hierarchy."
"To specify all levels or two levels of hierarchy",
"Choose between all or two levels of hierarchy",
false, false, mca_coll_hierarch_detection_alg_param,
&mca_coll_hierarch_detection_alg_param);
mca_base_param_reg_int(&mca_coll_hierarch_component.collm_version,
"bcast_alg",
"Used to specify the algorithm used for bcast operations.",
false, false, mca_coll_hierarch_bcast_alg_param,
&mca_coll_hierarch_bcast_alg_param);
mca_base_param_reg_int(&mca_coll_hierarch_component.collm_version,
"segment_size",
"Used to specify the segment size for segmented algorithms.",
false, false, mca_coll_hierarch_segsize_param,
&mca_coll_hierarch_segsize_param);
return OMPI_SUCCESS;
}