openmpi/ompi/mca/coll/hierarch/coll_hierarch_bcast.c
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2008 University of Houston. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "coll_hierarch.h"
#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/mca/pml/pml.h"
/*
* bcast_intra
*
* Function: - broadcast using hierarchical algorithm
* Accepts: - same arguments as MPI_Bcast()
* Returns: - MPI_SUCCESS or error code
*/
static int mca_coll_hierarch_bcast_intra_seg (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
static int mca_coll_hierarch_bcast_intra_seg1 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
static int mca_coll_hierarch_bcast_intra_seg2 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
static int mca_coll_hierarch_bcast_intra_seg3 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize );
int mca_coll_hierarch_bcast_intra(void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int bcast_alg = mca_coll_hierarch_bcast_alg_param;
int segsize = mca_coll_hierarch_segsize_param;
int ret=OMPI_SUCCESS;
/* Here is a brief description of what we try to evaluate:
- bcast_intra_seg uses the bcast of lcomm and llcomm, similar
to the original algorithm in hierarch. However, it can segment
the message, such that we might get an overlap between the two
layers. This overlap is based on the assumption that a process
might finish a bcast early and can start the next one.
- bcast_intra_seg1: replaces the llcomm->bcast by isend/irecvs
to increase the overlap, but keeps the lcomm->bcast
- bcast_intra_seg2: replaces the lcomm->bcast by isend/irecvs
to increase the overlap, but keeps the llcomm->bcast
- bcast_intra_seg3: replaces both lcomm->bcast and llcomm->bcast
by isend/irecvs
*/
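/* bcast_alg and segsize are taken from the component parameters read above;
* any other value of bcast_alg falls back to the unsegmented broadcast in
* the final else branch below. */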
if ( COLL_HIERARCH_SEG_BCAST_ALG == bcast_alg ) {
ret = mca_coll_hierarch_bcast_intra_seg ( buff, count, datatype, root,
comm, module, segsize );
}
else if ( COLL_HIERARCH_SEG1_BCAST_ALG == bcast_alg ) {
ret = mca_coll_hierarch_bcast_intra_seg1 ( buff, count, datatype, root,
comm, module, segsize );
}
else if ( COLL_HIERARCH_SEG2_BCAST_ALG == bcast_alg ) {
ret = mca_coll_hierarch_bcast_intra_seg2 ( buff, count, datatype, root,
comm, module, segsize );
}
else if ( COLL_HIERARCH_SEG3_BCAST_ALG == bcast_alg ) {
ret = mca_coll_hierarch_bcast_intra_seg3 ( buff, count, datatype, root,
comm, module, segsize );
}
else {
/* A segment size of zero forces the entire message to be broadcast
as a single segment. */
ret = mca_coll_hierarch_bcast_intra_seg ( buff, count, datatype, root,
comm, module, 0 );
}
return ret;
}
static int mca_coll_hierarch_bcast_intra_seg (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize )
{
struct ompi_communicator_t *llcomm=NULL;
struct ompi_communicator_t *lcomm=NULL;
mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
int rank=0, ret=OMPI_SUCCESS;
MPI_Aint ub=0, typeext=0;
size_t typesize=0;
int realsegsize=0, remaining_count=0;
int num_segments=0, segcount=0, segindex=0;
char* tmpbuf = (char *) buff;
rank = ompi_comm_rank ( comm );
lcomm = hierarch_module->hier_lcomm;
if ( mca_coll_hierarch_verbose_param ) {
printf("%s:%d: executing hierarchical seg bcast with cnt=%d root=%d, segsize=%d\n",
comm->c_name, rank, count, root, segsize );
}
/*
* This function returns the local leader communicator
* which *always* contains the root of this operation.
* This might involve creating a new communicator. This is
* also the reason that *every* process in comm has to call
* this function.
*/
llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);
ompi_datatype_type_size ( datatype, &typesize);
ompi_datatype_get_extent ( datatype, &ub, &typeext);
/* Determine number of segments and number of elements per segment */
if ((typesize > 0) && (segsize % typesize != 0)) {
/* segment size must be a multiple of typesize */
segsize = typesize * (segsize / typesize);
}
if ((segsize == 0) || (count == 0) || (typesize == 0)) {
segcount = count;
num_segments = 1;
}
else {
segcount = segsize/typesize;
num_segments = count/segcount;
if ( (count % segcount) != 0 ) {
num_segments++;
}
if (num_segments == 1) {
segcount = count;
}
}
realsegsize = segcount*typeext;
remaining_count = segcount;
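/* Worked example (hypothetical numbers): with segsize = 32768 bytes and an
* 8-byte basic datatype (extent == size), segcount = 4096 elements and
* realsegsize = 32768 bytes; a message of count = 10000 elements is then
* sent in num_segments = 3 rounds of 4096, 4096 and 1808 elements. */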
for (segindex = 0; segindex < num_segments; segindex++) {
/* determine how many elements are being sent in this round */
if( segindex == (num_segments - 1) ) {
remaining_count = count - segindex*segcount;
}
/* Bcast on the upper level among the local leaders */
if ( MPI_UNDEFINED != llroot ) {
ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count,
datatype, llroot, llcomm,
llcomm->c_coll.coll_bcast_module);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
/* Once the local leaders have received the data from the root, they can
* distribute it to the processes in their local, low-level communicator.
*/
if ( MPI_COMM_NULL != lcomm ) {
ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count,
datatype, lroot, lcomm,
lcomm->c_coll.coll_bcast_module);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
tmpbuf += realsegsize;
}
return ret;
}
static int mca_coll_hierarch_bcast_intra_seg1 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize )
{
struct ompi_communicator_t *llcomm=NULL;
struct ompi_communicator_t *lcomm=NULL;
mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
int llrank=0, llsize=0, rank=0, ret=OMPI_SUCCESS;
MPI_Aint ub=0, typeext=0;
size_t typesize=0;
int i, realsegsize=0, remaining_count=0;
int num_segments=0, segcount=0, segindex=0;
char* tmpbuf = (char *) buff;
ompi_request_t **sreq=NULL;
ompi_request_t *rreq=MPI_REQUEST_NULL;
rank = ompi_comm_rank ( comm );
lcomm = hierarch_module->hier_lcomm;
if ( mca_coll_hierarch_verbose_param ) {
printf("%s:%d: executing hierarchical seg1 bcast with cnt=%d root=%d segsize=%d\n",
comm->c_name, rank, count, root, segsize );
}
/*
* This function returns the local leader communicator
* which *always* contains the root of this operation.
* This might involve creating a new communicator. This is
* also the reason that *every* process in comm has to call
* this function.
*/
llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);
ompi_datatype_type_size ( datatype, &typesize);
ompi_datatype_get_extent ( datatype, &ub, &typeext);
/* Determine number of segments and number of elements per segment */
if ((typesize > 0) && (segsize % typesize != 0)) {
/* segment size must be a multiple of typesize */
segsize = typesize * (segsize / typesize);
}
if ((segsize == 0) || (count == 0) || (typesize == 0)) {
segcount = count;
num_segments = 1;
}
else {
segcount = segsize/typesize;
num_segments = count/segcount;
if ( (count % segcount) != 0 ) {
num_segments++;
}
if (num_segments == 1) {
segcount = count;
}
}
realsegsize = segcount*typeext;
remaining_count = segcount;
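/* The non-blocking upper-level sends in the loop below reuse the request
* array stored in the hierarch module (hier_reqs). */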
if ( MPI_COMM_NULL != llcomm ) {
llrank = ompi_comm_rank ( llcomm );
llsize = ompi_comm_size ( llcomm);
sreq = hierarch_module->hier_reqs;
for(i=0; i<llsize; i++) {
sreq[i] = MPI_REQUEST_NULL;
}
}
/* Broadcasting the first segment in the upper level*/
if ( MPI_UNDEFINED != llroot ) {
ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
llroot, llcomm,
llcomm->c_coll.coll_bcast_module );
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
/* Since the first segment has already been broadcast, this loop
starts at segment 1 instead of segment 0 */
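/* Pipeline structure: in iteration segindex the local leaders transfer
* segment segindex across the upper level with non-blocking isend/irecv,
* while the previously received segment (segindex-1) is broadcast within
* the lower-level communicator using the blocking bcast; the upper-level
* requests are waited for only after that. */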
for (segindex = 1; segindex < num_segments; segindex++) {
/* determine how many elements are being sent in this round */
if( segindex == (num_segments - 1) ) {
remaining_count = count - segindex*segcount;
}
tmpbuf += realsegsize;
/* Broadcasting the next segment in the upper level using non blocking
operations*/
if ( MPI_COMM_NULL != llcomm ) {
if( llrank == llroot) {
for( i = 0; i < llsize; i++) {
if( i != llroot) {
ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i,
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD,
llcomm, &(sreq[i])));
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
}
}
else {
ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, llroot,
MCA_COLL_BASE_TAG_BCAST,
llcomm, &rreq ));
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
}
/* Broadcast the previous segment among the lower level processes
using blocking operations */
if ( MPI_COMM_NULL != lcomm ) {
ret = lcomm->c_coll.coll_bcast(tmpbuf-realsegsize, segcount,
datatype, lroot, lcomm,
lcomm->c_coll.coll_bcast_module);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
if ( MPI_COMM_NULL != llcomm ) {
if ( llrank == llroot ) {
ret = ompi_request_wait_all( llsize, sreq, MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
else {
ret = ompi_request_wait_all(1, &rreq, MPI_STATUS_IGNORE);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
}
}
/* Broadcast the last segment among the lower level processes using blocking
* operations. Once the local leaders have received the data from the root,
* they can distribute it to the processes in their local, low-level
* communicator.
*/
if ( MPI_COMM_NULL != lcomm ) {
ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
lroot, lcomm,
lcomm->c_coll.coll_bcast_module);
}
return ret;
}
static int mca_coll_hierarch_bcast_intra_seg2 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize )
{
struct ompi_communicator_t *llcomm=NULL;
struct ompi_communicator_t *lcomm=NULL;
mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
int rank=0, ret=OMPI_SUCCESS;
int lsize=0, lrank=0;
MPI_Aint ub=0, typeext=0;
size_t typesize=0;
int i, realsegsize=0, remaining_count=0;
int num_segments=0, segcount=0, segindex=0;
char* tmpbuf = (char *) buff;
ompi_request_t **sreq=NULL;
ompi_request_t *rreq=MPI_REQUEST_NULL;
rank = ompi_comm_rank ( comm );
lcomm = hierarch_module->hier_lcomm;
if ( mca_coll_hierarch_verbose_param ) {
printf("%s:%d: executing hierarchical seg2 bcast with cnt=%d root=%d segsize=%d\n",
comm->c_name, rank, count, root, segsize );
}
/*
* This function returns the local leader communicator
* which *always* contains the root of this operation.
* This might involve creating a new communicator. This is
* also the reason that *every* process in comm has to call
* this function.
*/
llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);
ompi_datatype_type_size ( datatype, &typesize);
ompi_datatype_get_extent ( datatype, &ub, &typeext);
/* Determine number of segments and number of elements per segment */
if ((typesize > 0) && (segsize % typesize != 0)) {
/* segment size must be a multiple of typesize */
segsize = typesize * (segsize / typesize);
}
if ((segsize == 0) || (count == 0) || (typesize == 0)) {
segcount = count;
num_segments = 1;
}
else {
segcount = segsize/typesize;
num_segments = count/segcount;
if ( (count % segcount) != 0 ) {
num_segments++;
}
if (num_segments == 1) {
segcount = count;
}
}
realsegsize = segcount*typeext;
remaining_count = segcount;
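/* In this variant hier_reqs holds the requests of the non-blocking
* lower-level sends, while the upper level keeps its blocking bcast. */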
lsize = ompi_comm_size (lcomm);
sreq = hierarch_module->hier_reqs;
for(i=0; i<lsize; i++) {
sreq[i] = MPI_REQUEST_NULL;
}
if ( MPI_UNDEFINED != llroot ) {
ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
llroot, llcomm,
llcomm->c_coll.coll_bcast_module);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
if ( MPI_COMM_NULL != lcomm ) {
lrank = ompi_comm_rank ( lcomm );
}
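/* Pipeline structure (mirror image of seg1): in iteration segindex the
* previously received segment (segindex-1) is distributed within the
* lower-level communicator using non-blocking isend/irecv, while the
* local leaders broadcast segment segindex across the upper level with
* the blocking bcast; the lower-level requests are waited for afterwards. */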
for (segindex = 1; segindex < num_segments; segindex++) {
/* Once the local leaders have received the data from the root, they can
* distribute it to the processes in their local, low-level communicator. */
if ( MPI_COMM_NULL != lcomm ) {
if(lrank == lroot) {
for(i = 0; i < lsize; i++) {
if( i != lroot) {
ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i,
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD,
lcomm, &(sreq[i])));
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
}
}
else {
ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, lroot,
MCA_COLL_BASE_TAG_BCAST, lcomm, &rreq));
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
}
/* determine how many elements are being sent in this round */
if( segindex == (num_segments - 1) ) {
remaining_count = count - segindex*segcount;
}
tmpbuf += realsegsize;
if ( MPI_UNDEFINED != llroot ) {
ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
llroot, llcomm,
llcomm->c_coll.coll_bcast_module);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
if ( MPI_COMM_NULL != lcomm ) {
if ( lrank == lroot ) {
ret = ompi_request_wait_all ( lsize, sreq, MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
else {
ret = ompi_request_wait_all ( 1, &rreq, MPI_STATUS_IGNORE);
if ( OMPI_SUCCESS != ret ) {
return ret;
}
}
}
}
/* Broadcast the last segment among the lower level processes.
* Once the local leaders have received the data from the root, they can
* distribute it to the processes in their local, low-level communicator.
*/
if ( MPI_COMM_NULL != lcomm ) {
ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
lroot, lcomm,
lcomm->c_coll.coll_bcast_module);
}
return ret;
}
static int mca_coll_hierarch_bcast_intra_seg3 (void *buff,
int count,
struct ompi_datatype_t *datatype,
int root,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int segsize )
{
struct ompi_communicator_t *llcomm=NULL;
struct ompi_communicator_t *lcomm=NULL;
mca_coll_hierarch_module_t *hierarch_module = (mca_coll_hierarch_module_t *) module;
int lroot=MPI_UNDEFINED, llroot=MPI_UNDEFINED;
int llrank=MPI_UNDEFINED, llsize=0, rank=0, ret=OMPI_SUCCESS;
int lsize=0, lrank=MPI_UNDEFINED;
MPI_Aint ub=0, typeext=0;
size_t typesize=0;
int i, realsegsize=0, remaining_count=0;
int num_segments=0, segcount=0, segindex=0;
char* tmpbuf = (char *) buff;
ompi_request_t **sreq=NULL, **sreq1=NULL;
ompi_request_t *rreq=MPI_REQUEST_NULL, *rreq1=MPI_REQUEST_NULL;
rank = ompi_comm_rank ( comm );
lcomm = hierarch_module->hier_lcomm;
if ( mca_coll_hierarch_verbose_param ) {
printf("%s:%d: executing hierarchical seg3 bcast with cnt=%d root=%d segsize=%d\n",
comm->c_name, rank, count, root, segsize );
}
/*
* This function returns the local leader communicator
* which *always* contains the root of this operation.
* This might involve creating a new communicator. This is
* also the reason that *every* process in comm has to call
* this function.
*/
llcomm = mca_coll_hierarch_get_llcomm ( root, hierarch_module, &llroot, &lroot);
ompi_datatype_type_size ( datatype, &typesize);
ompi_datatype_get_extent ( datatype, &ub, &typeext);
/* Determine number of segments and number of elements per segment */
if ((typesize > 0) && (segsize % typesize != 0)) {
/* segment size must be a multiple of typesize */
segsize = typesize * (segsize / typesize);
}
if ((segsize == 0) || (count == 0) || (typesize == 0)) {
segcount = count;
num_segments = 1;
} else {
segcount = segsize/typesize;
num_segments = count/segcount;
if ( (count % segcount) != 0 ) num_segments++;
if (num_segments == 1) segcount = count;
}
realsegsize = segcount*typeext;
remaining_count = segcount;
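/* Both levels post non-blocking sends in the same iteration, so the
* module-provided hier_reqs array is used for the upper-level (llcomm)
* requests and a separate array (sreq1) is allocated for the lower-level
* (lcomm) requests. */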
if ( MPI_COMM_NULL != lcomm ) {
lsize = ompi_comm_size ( lcomm );
lrank = ompi_comm_rank ( lcomm );
sreq1 = (ompi_request_t **)malloc ( lsize * sizeof(ompi_request_t *));
if ( NULL == sreq1 ) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(i=0; i<lsize; i++) {
sreq1[i] = MPI_REQUEST_NULL;
}
}
if ( MPI_COMM_NULL != llcomm ) {
llsize = ompi_comm_size (llcomm);
llrank = ompi_comm_rank ( llcomm );
sreq = hierarch_module->hier_reqs;
for(i=0; i<llsize; i++) {
sreq[i] = MPI_REQUEST_NULL;
}
}
/* Broadcasting the first segment in the upper level*/
if ( MPI_UNDEFINED != llroot ) {
ret = llcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
llroot, llcomm,
llcomm->c_coll.coll_bcast_module);
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
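/* Pipeline structure: each iteration posts non-blocking transfers for
* segment segindex on the upper level and for segment segindex-1 on the
* lower level, and then waits first for the upper-level and then for the
* lower-level requests, so both layers progress using only point-to-point
* operations inside the loop. */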
for (segindex = 1; segindex < num_segments; segindex++) {
/* determine how many elements are being sent in this round */
if( segindex == (num_segments - 1) ) {
remaining_count = count - segindex*segcount;
}
tmpbuf += realsegsize;
/* Broadcasting the next segment in the upper level*/
if ( MPI_COMM_NULL != llcomm ) {
if(llrank == llroot) {
for(i = 0; i < llsize; i++) {
if( i != llroot) {
ret = MCA_PML_CALL(isend(tmpbuf, remaining_count, datatype, i,
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD,
llcomm, (sreq+i) ));
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
}
}
else {
ret = MCA_PML_CALL(irecv(tmpbuf, remaining_count, datatype, llroot,
MCA_COLL_BASE_TAG_BCAST,
llcomm, &rreq ));
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
}
/* Broadcast the previous segment among the lower level processes.
* Once the local leaders have received the data from the root, they can
* distribute it to the processes in their local, low-level communicator.
*/
if ( MPI_COMM_NULL != lcomm ) {
if( lrank == lroot) {
for( i = 0; i < lsize; i++) {
if( i != lroot) {
ret = MCA_PML_CALL(isend(tmpbuf-realsegsize, segcount, datatype, i,
MCA_COLL_BASE_TAG_BCAST,
MCA_PML_BASE_SEND_STANDARD,
lcomm, (sreq1+i) ));
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
}
}
else {
ret = MCA_PML_CALL(irecv(tmpbuf-realsegsize, segcount, datatype, lroot,
MCA_COLL_BASE_TAG_BCAST , lcomm, &rreq1 ));
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
}
/* Wait for the upper level bcast to complete*/
if ( MPI_COMM_NULL != llcomm ) {
if ( llrank == llroot ) {
ret = ompi_request_wait_all(llsize, sreq, MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
else {
ret = ompi_request_wait_all ( 1, &rreq, MPI_STATUS_IGNORE );
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
}
/*Wait for the lower level bcast to complete */
if ( MPI_COMM_NULL != lcomm ) {
if ( lrank == lroot ) {
ret = ompi_request_wait_all(lsize, sreq1, MPI_STATUSES_IGNORE);
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
else {
ret = ompi_request_wait_all( 1, &rreq1, MPI_STATUS_IGNORE);
if ( OMPI_SUCCESS != ret ) {
goto exit;
}
}
}
}
/* Broadcast the last segment among the lower level processes.
* Once the local leaders have received the data from the root, they can
* distribute it to the processes in their local, low-level communicator.
*/
if ( MPI_COMM_NULL != lcomm ) {
ret = lcomm->c_coll.coll_bcast(tmpbuf, remaining_count, datatype,
lroot, lcomm,
lcomm->c_coll.coll_bcast_module);
}
exit:
if ( NULL != sreq1 ) {
free ( sreq1 );
}
return ret;
}