Complete the dismantling of the tuned module.
Этот коммит содержится в:
родитель
aa019e239e
Коммит
211f05fb09
@ -22,7 +22,8 @@ headers += \
|
||||
base/base.h \
|
||||
base/coll_tags.h \
|
||||
base/coll_base_topo.h \
|
||||
base/coll_base_util.h
|
||||
base/coll_base_util.h \
|
||||
base/coll_base_functions.h
|
||||
|
||||
libmca_coll_la_SOURCES += \
|
||||
base/coll_base_comm_select.c \
|
||||
|
@ -2,7 +2,7 @@
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
# Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
@ -19,29 +19,25 @@
|
||||
|
||||
sources = \
|
||||
coll_tuned.h \
|
||||
coll_tuned_topo.h \
|
||||
coll_tuned_util.h \
|
||||
coll_tuned_dynamic_file.h \
|
||||
coll_tuned_dynamic_rules.h \
|
||||
coll_tuned_topo.c \
|
||||
coll_tuned_util.c \
|
||||
coll_tuned_decision_fixed.c \
|
||||
coll_tuned_decision_dynamic.c \
|
||||
coll_tuned_dynamic_file.c \
|
||||
coll_tuned_dynamic_rules.c \
|
||||
coll_tuned_allreduce.c \
|
||||
coll_tuned_alltoall.c \
|
||||
coll_tuned_alltoallv.c \
|
||||
coll_tuned_allgather.c \
|
||||
coll_tuned_allgatherv.c \
|
||||
coll_tuned_barrier.c \
|
||||
coll_tuned_bcast.c \
|
||||
coll_tuned_reduce.c \
|
||||
coll_tuned_reduce_scatter.c \
|
||||
coll_tuned_gather.c \
|
||||
coll_tuned_scatter.c \
|
||||
coll_tuned_component.c \
|
||||
coll_tuned_module.c
|
||||
coll_tuned_module.c \
|
||||
coll_tuned_allgather_decision.c \
|
||||
coll_tuned_allgatherv_decision.c \
|
||||
coll_tuned_allreduce_decision.c \
|
||||
coll_tuned_alltoall_decision.c \
|
||||
coll_tuned_gather_decision.c \
|
||||
coll_tuned_alltoallv_decision.c \
|
||||
coll_tuned_barrier_decision.c \
|
||||
coll_tuned_reduce_decision.c \
|
||||
coll_tuned_bcast_decision.c \
|
||||
coll_tuned_reduce_scatter_decision.c \
|
||||
coll_tuned_scatter_decision.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
|
@ -18,7 +18,7 @@
|
||||
#include "mpi.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/coll/base/coll_base.h"
|
||||
#include "ompi/mca/coll/base/coll_base_functions.h"
|
||||
|
||||
/* also need the dynamic rule structures */
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
|
@ -272,56 +272,13 @@ static int tuned_close(void)
|
||||
static void
|
||||
mca_coll_tuned_module_construct(mca_coll_tuned_module_t *module)
|
||||
{
|
||||
module->tuned_data = NULL;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
mca_coll_tuned_module_destruct(mca_coll_tuned_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_comm_t *data;
|
||||
|
||||
/* Free the space in the data mpool and the data hanging off the
|
||||
communicator */
|
||||
|
||||
data = module->tuned_data;
|
||||
if (NULL != data) {
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
|
||||
the generel c_coll_selected_data */
|
||||
data->mcct_reqs = NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
#endif
|
||||
|
||||
/* free any cached information that has been allocated */
|
||||
if (data->cached_ntree) { /* destroy general tree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_ntree);
|
||||
}
|
||||
if (data->cached_bintree) { /* destroy bintree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_bintree);
|
||||
}
|
||||
if (data->cached_bmtree) { /* destroy bmtree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_bmtree);
|
||||
}
|
||||
if (data->cached_in_order_bmtree) { /* destroy bmtree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bmtree);
|
||||
}
|
||||
if (data->cached_chain) { /* destroy general chain if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_chain);
|
||||
}
|
||||
if (data->cached_pipeline) { /* destroy pipeline if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_pipeline);
|
||||
}
|
||||
if (data->cached_in_order_bintree) { /* destroy in order bintree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&data->cached_in_order_bintree);
|
||||
}
|
||||
|
||||
free(data);
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
|
||||
for( int i = 0; i < COLLCOUNT; i++ ) {
|
||||
tuned_module->user_forced[i].algorithm = 0;
|
||||
tuned_module->com_rules[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
OBJ_CLASS_INSTANCE(mca_coll_tuned_module_t,
|
||||
mca_coll_base_module_t,
|
||||
mca_coll_tuned_module_construct,
|
||||
mca_coll_tuned_module_destruct);
|
||||
OBJ_CLASS_INSTANCE(mca_coll_tuned_module_t, mca_coll_base_module_t,
|
||||
mca_coll_tuned_module_construct, NULL);
|
||||
|
@ -2,18 +2,18 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -28,13 +28,10 @@
|
||||
#include "ompi/mca/coll/base/coll_tags.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
#include "coll_tuned.h"
|
||||
|
||||
|
||||
/*
|
||||
* Notes on evaluation rules and ordering
|
||||
*
|
||||
* The order is:
|
||||
* Notes on evaluation rules and ordering
|
||||
*
|
||||
* The order is:
|
||||
* use file based rules if presented (-coll_tuned_dynamic_rules_filename = rules)
|
||||
* Else
|
||||
* use forced rules (-coll_tuned_dynamic_ALG_intra_algorithm = algorithm-number)
|
||||
@ -58,12 +55,11 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[ALLREDUCE]) {
|
||||
if (tuned_module->com_rules[ALLREDUCE]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize;
|
||||
@ -71,7 +67,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
ompi_datatype_type_size (dtype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLREDUCE],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLREDUCE],
|
||||
dsize, &faninout, &segsize, &ignoreme);
|
||||
|
||||
if (alg) {
|
||||
@ -82,7 +78,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[ALLREDUCE].algorithm) {
|
||||
if (tuned_module->user_forced[ALLREDUCE].algorithm) {
|
||||
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
@ -91,27 +87,26 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
}
|
||||
|
||||
/*
|
||||
* alltoall_intra_dec
|
||||
* alltoall_intra_dec
|
||||
*
|
||||
* Function: - selects alltoall algorithm to use
|
||||
* Accepts: - same arguments as MPI_Alltoall()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the alltoall implementation)
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[ALLTOALL]) {
|
||||
if (tuned_module->com_rules[ALLTOALL]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int comsize;
|
||||
int alg, faninout, segsize, max_requests;
|
||||
@ -121,7 +116,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
comsize = ompi_comm_size(comm);
|
||||
dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLTOALL],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALL],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -133,7 +128,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[ALLTOALL].algorithm) {
|
||||
if (tuned_module->user_forced[ALLTOALL].algorithm) {
|
||||
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
@ -152,12 +147,11 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *sdisps,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts, int *rdisps,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoallv_intra_dec_dynamic"));
|
||||
|
||||
@ -167,10 +161,10 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
||||
* This allow the users to specify the alltoallv algorithm to be used only
|
||||
* based on the communicator size.
|
||||
*/
|
||||
if (data->com_rules[ALLTOALLV]) {
|
||||
if (tuned_module->com_rules[ALLTOALLV]) {
|
||||
int alg, faninout, segsize, max_requests;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLTOALLV],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLTOALLV],
|
||||
0, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -182,7 +176,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[ALLTOALLV].algorithm) {
|
||||
if (tuned_module->user_forced[ALLTOALLV].algorithm) {
|
||||
return ompi_coll_tuned_alltoallv_intra_do_forced(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps, rdtype,
|
||||
comm, module);
|
||||
@ -193,7 +187,7 @@ int ompi_coll_tuned_alltoallv_intra_dec_dynamic(void *sbuf, int *scounts, int *s
|
||||
}
|
||||
|
||||
/*
|
||||
* barrier_intra_dec
|
||||
* barrier_intra_dec
|
||||
*
|
||||
* Function: - selects barrier algorithm to use
|
||||
* Accepts: - same arguments as MPI_Barrier()
|
||||
@ -203,16 +197,15 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_barrier_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[BARRIER]) {
|
||||
if (tuned_module->com_rules[BARRIER]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[BARRIER],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BARRIER],
|
||||
0, &faninout, &segsize, &ignoreme);
|
||||
|
||||
if (alg) {
|
||||
@ -222,14 +215,14 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[BARRIER].algorithm) {
|
||||
if (tuned_module->user_forced[BARRIER].algorithm) {
|
||||
return ompi_coll_tuned_barrier_intra_do_forced (comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_barrier_intra_dec_fixed (comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* bcast_intra_dec
|
||||
* bcast_intra_dec
|
||||
*
|
||||
* Function: - selects broadcast algorithm to use
|
||||
* Accepts: - same arguments as MPI_Bcast()
|
||||
@ -241,12 +234,11 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:bcast_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[BCAST]) {
|
||||
if (tuned_module->com_rules[BCAST]) {
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize;
|
||||
@ -254,7 +246,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
ompi_datatype_type_size (datatype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[BCAST],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[BCAST],
|
||||
dsize, &faninout, &segsize, &ignoreme);
|
||||
|
||||
if (alg) {
|
||||
@ -266,7 +258,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
} /*end if any com rules to check */
|
||||
|
||||
|
||||
if (data->user_forced[BCAST].algorithm) {
|
||||
if (tuned_module->user_forced[BCAST].algorithm) {
|
||||
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root,
|
||||
comm, module);
|
||||
}
|
||||
@ -275,12 +267,12 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_intra_dec
|
||||
* reduce_intra_dec
|
||||
*
|
||||
* Function: - selects reduce algorithm to use
|
||||
* Accepts: - same arguments as MPI_Reduce()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the reduce implementation)
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
@ -289,12 +281,11 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[REDUCE]) {
|
||||
if (tuned_module->com_rules[REDUCE]) {
|
||||
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize, max_requests;
|
||||
@ -303,21 +294,21 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
ompi_datatype_type_size (datatype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[REDUCE],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCE],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
|
||||
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
|
||||
op, root,
|
||||
comm, module,
|
||||
alg, faninout,
|
||||
segsize,
|
||||
alg, faninout,
|
||||
segsize,
|
||||
max_requests);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[REDUCE].algorithm) {
|
||||
if (tuned_module->user_forced[REDUCE].algorithm) {
|
||||
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype,
|
||||
op, root,
|
||||
comm, module);
|
||||
@ -328,15 +319,15 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_scatter_intra_dec
|
||||
* reduce_scatter_intra_dec
|
||||
*
|
||||
* Function: - selects reduce_scatter algorithm to use
|
||||
* Accepts: - same arguments as MPI_Reduce_scatter()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from
|
||||
* the reduce_scatter implementation)
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
int *rcounts,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
@ -344,13 +335,12 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_scatter_intra_dec_dynamic"));
|
||||
|
||||
/* check to see if we have some filebased rules */
|
||||
if (data->com_rules[REDUCESCATTER]) {
|
||||
/* we do, so calc the message size or what ever we need and use
|
||||
if (tuned_module->com_rules[REDUCESCATTER]) {
|
||||
/* we do, so calc the message size or what ever we need and use
|
||||
this for the evaluation */
|
||||
int alg, faninout, segsize, ignoreme, i, count, size;
|
||||
size_t dsize;
|
||||
@ -359,21 +349,21 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
ompi_datatype_type_size (dtype, &dsize);
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[REDUCESCATTER],
|
||||
dsize, &faninout,
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[REDUCESCATTER],
|
||||
dsize, &faninout,
|
||||
&segsize, &ignoreme);
|
||||
if (alg) {
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module,
|
||||
alg, faninout,
|
||||
alg, faninout,
|
||||
segsize);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[REDUCESCATTER].algorithm) {
|
||||
return ompi_coll_tuned_reduce_scatter_intra_do_forced (sbuf, rbuf, rcounts,
|
||||
|
||||
if (tuned_module->user_forced[REDUCESCATTER].algorithm) {
|
||||
return ompi_coll_tuned_reduce_scatter_intra_do_forced (sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
@ -383,7 +373,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
}
|
||||
|
||||
/*
|
||||
* allgather_intra_dec
|
||||
* allgather_intra_dec
|
||||
*
|
||||
* Function: - selects allgather algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgather()
|
||||
@ -391,58 +381,57 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(void *sbuf, void *rbuf,
|
||||
* allgather function).
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgather_intra_dec_dynamic"));
|
||||
|
||||
if (data->com_rules[ALLGATHER]) {
|
||||
|
||||
if (tuned_module->com_rules[ALLGATHER]) {
|
||||
/* We have file based rules:
|
||||
- calculate message size and other necessary information */
|
||||
int comsize;
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize;
|
||||
|
||||
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
comsize = ompi_comm_size(comm);
|
||||
dsize *= (ptrdiff_t)comsize * (ptrdiff_t)scount;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHER],
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHER],
|
||||
dsize, &faninout, &segsize, &ignoreme);
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
this message size */
|
||||
return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module,
|
||||
alg, faninout, segsize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We do not have file based rules */
|
||||
if (data->user_forced[ALLGATHER].algorithm) {
|
||||
if (tuned_module->user_forced[ALLGATHER].algorithm) {
|
||||
/* User-forced algorithm */
|
||||
return ompi_coll_tuned_allgather_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_allgather_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Use default decision */
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_allgather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* allgatherv_intra_dec
|
||||
* allgatherv_intra_dec
|
||||
*
|
||||
* Function: - selects allgatherv algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgatherv()
|
||||
@ -450,71 +439,69 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
* allgatherv function).
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts,
|
||||
void* rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgatherv_intra_dec_dynamic"));
|
||||
|
||||
if (data->com_rules[ALLGATHERV]) {
|
||||
|
||||
if (tuned_module->com_rules[ALLGATHERV]) {
|
||||
/* We have file based rules:
|
||||
- calculate message size and other necessary information */
|
||||
int comsize, i;
|
||||
int alg, faninout, segsize, ignoreme;
|
||||
size_t dsize, total_size;
|
||||
|
||||
comsize = ompi_comm_size(comm);
|
||||
comsize = ompi_comm_size(comm);
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
total_size = 0;
|
||||
for (i = 0; i < comsize; i++) { total_size += dsize * rcounts[i]; }
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[ALLGATHERV],
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHERV],
|
||||
total_size, &faninout, &segsize, &ignoreme);
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
if (alg) {
|
||||
/* we have found a valid choice from the file based rules for
|
||||
this message size */
|
||||
return ompi_coll_tuned_allgatherv_intra_do_this (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
comm, module,
|
||||
alg, faninout, segsize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* We do not have file based rules */
|
||||
if (data->user_forced[ALLGATHERV].algorithm) {
|
||||
if (tuned_module->user_forced[ALLGATHERV].algorithm) {
|
||||
/* User-forced algorithm */
|
||||
return ompi_coll_tuned_allgatherv_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
return ompi_coll_tuned_allgatherv_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Use default decision */
|
||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
return ompi_coll_tuned_allgatherv_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcounts,
|
||||
rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_dec_dynamic"));
|
||||
@ -522,15 +509,15 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
/**
|
||||
* check to see if we have some filebased rules.
|
||||
*/
|
||||
if (data->com_rules[GATHER]) {
|
||||
if (tuned_module->com_rules[GATHER]) {
|
||||
int comsize, alg, faninout, segsize, max_requests;
|
||||
size_t dsize;
|
||||
|
||||
comsize = ompi_comm_size(comm);
|
||||
comsize = ompi_comm_size(comm);
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
dsize *= comsize;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[GATHER],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[GATHER],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -542,26 +529,25 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(void *sbuf, int scount,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[GATHER].algorithm) {
|
||||
if (tuned_module->user_forced[GATHER].algorithm) {
|
||||
return ompi_coll_tuned_gather_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_gather_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
|
||||
mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_scatter_intra_dec_dynamic"));
|
||||
@ -569,15 +555,15 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
/**
|
||||
* check to see if we have some filebased rules.
|
||||
*/
|
||||
if (data->com_rules[SCATTER]) {
|
||||
if (tuned_module->com_rules[SCATTER]) {
|
||||
int comsize, alg, faninout, segsize, max_requests;
|
||||
size_t dsize;
|
||||
|
||||
comsize = ompi_comm_size(comm);
|
||||
comsize = ompi_comm_size(comm);
|
||||
ompi_datatype_type_size (sdtype, &dsize);
|
||||
dsize *= comsize;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (data->com_rules[SCATTER],
|
||||
alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[SCATTER],
|
||||
dsize, &faninout, &segsize, &max_requests);
|
||||
|
||||
if (alg) {
|
||||
@ -589,13 +575,13 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(void *sbuf, int scount,
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (data->user_forced[SCATTER].algorithm) {
|
||||
if (tuned_module->user_forced[SCATTER].algorithm) {
|
||||
return ompi_coll_tuned_scatter_intra_do_forced (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
return ompi_coll_tuned_scatter_intra_dec_fixed (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
@ -3,10 +3,10 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2012 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
@ -14,9 +14,9 @@
|
||||
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -31,7 +31,6 @@
|
||||
#include "ompi/op/op.h"
|
||||
#include "coll_tuned.h"
|
||||
|
||||
|
||||
/*
|
||||
* allreduce_intra
|
||||
*
|
||||
@ -40,11 +39,11 @@
|
||||
* Returns: - MPI_SUCCESS or error code
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
size_t dsize, block_dsize;
|
||||
int comm_size = ompi_comm_size(comm);
|
||||
@ -53,8 +52,8 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
|
||||
/**
|
||||
* Decision function based on MX results from the Grig cluster at UTK.
|
||||
*
|
||||
* Currently, linear, recursive doubling, and nonoverlapping algorithms
|
||||
*
|
||||
* Currently, linear, recursive doubling, and nonoverlapping algorithms
|
||||
* can handle both commutative and non-commutative operations.
|
||||
* Ring algorithm does not support non-commutative operations.
|
||||
*/
|
||||
@ -62,40 +61,40 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
block_dsize = dsize * (ptrdiff_t)count;
|
||||
|
||||
if (block_dsize < intermediate_message) {
|
||||
return (ompi_coll_tuned_allreduce_intra_recursivedoubling (sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module));
|
||||
}
|
||||
return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module));
|
||||
}
|
||||
|
||||
if( ompi_op_is_commute(op) && (count > comm_size) ) {
|
||||
const size_t segment_size = 1 << 20; /* 1 MB */
|
||||
if (((size_t)comm_size * (size_t)segment_size >= block_dsize)) {
|
||||
return (ompi_coll_tuned_allreduce_intra_ring (sbuf, rbuf, count, dtype,
|
||||
op, comm, module));
|
||||
return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype,
|
||||
op, comm, module));
|
||||
} else {
|
||||
return (ompi_coll_tuned_allreduce_intra_ring_segmented (sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module,
|
||||
segment_size));
|
||||
return (ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf,
|
||||
count, dtype,
|
||||
op, comm, module,
|
||||
segment_size));
|
||||
}
|
||||
}
|
||||
|
||||
return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count,
|
||||
dtype, op, comm, module));
|
||||
return (ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count,
|
||||
dtype, op, comm, module));
|
||||
}
|
||||
|
||||
/*
|
||||
* alltoall_intra_dec
|
||||
* alltoall_intra_dec
|
||||
*
|
||||
* Function: - selects alltoall algorithm to use
|
||||
* Accepts: - same arguments as MPI_Alltoall()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the alltoall implementation)
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
@ -109,12 +108,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
|
||||
/* special case */
|
||||
if (communicator_size==2) {
|
||||
return ompi_coll_tuned_alltoall_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Decision function based on measurement on Grig cluster at
|
||||
/* Decision function based on measurement on Grig cluster at
|
||||
the University of Tennessee (2GB MX) up to 64 nodes.
|
||||
Has better performance for messages of intermediate sizes than the old one */
|
||||
/* determine block size */
|
||||
@ -123,19 +122,19 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
|
||||
if ((block_dsize < (size_t) ompi_coll_tuned_alltoall_small_msg)
|
||||
&& (communicator_size > 12)) {
|
||||
return ompi_coll_tuned_alltoall_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
|
||||
} else if (block_dsize < (size_t) ompi_coll_tuned_alltoall_intermediate_msg) {
|
||||
return ompi_coll_tuned_alltoall_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
|
||||
#if 0
|
||||
/* previous decision */
|
||||
@ -148,12 +147,12 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
ompi_comm_rank(comm), communicator_size, total_dsize));
|
||||
|
||||
if (communicator_size >= 12 && total_dsize <= 768) {
|
||||
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
return ompi_coll_base_alltoall_intra_bruck(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
}
|
||||
if (total_dsize <= 131072) {
|
||||
return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
return ompi_coll_base_alltoall_intra_basic_linear(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
return ompi_coll_base_alltoall_intra_pairwise(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -170,14 +169,14 @@ int ompi_coll_tuned_alltoallv_intra_dec_fixed(void *sbuf, int *scounts, int *sdi
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
/* For starters, just keep the original algorithm. */
|
||||
return ompi_coll_tuned_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps,rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_alltoallv_intra_pairwise(sbuf, scounts, sdisps, sdtype,
|
||||
rbuf, rcounts, rdisps,rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* barrier_intra_dec
|
||||
* barrier_intra_dec
|
||||
*
|
||||
* Function: - selects barrier algorithm to use
|
||||
* Accepts: - same arguments as MPI_Barrier()
|
||||
@ -192,7 +191,7 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
|
||||
communicator_size));
|
||||
|
||||
if( 2 == communicator_size )
|
||||
return ompi_coll_tuned_barrier_intra_two_procs(comm, module);
|
||||
return ompi_coll_base_barrier_intra_two_procs(comm, module);
|
||||
/**
|
||||
* Basic optimisation. If we have a power of 2 number of nodes
|
||||
* then use the recursive doubling algorithm, otherwise
|
||||
@ -203,19 +202,17 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm,
|
||||
for( ; communicator_size > 0; communicator_size >>= 1 ) {
|
||||
if( communicator_size & 0x1 ) {
|
||||
if( has_one )
|
||||
return ompi_coll_tuned_barrier_intra_bruck(comm, module);
|
||||
return ompi_coll_base_barrier_intra_bruck(comm, module);
|
||||
has_one = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ompi_coll_tuned_barrier_intra_recursivedoubling(comm, module);
|
||||
/* return ompi_coll_tuned_barrier_intra_linear(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_doublering(comm); */
|
||||
return ompi_coll_base_barrier_intra_recursivedoubling(comm, module);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* bcast_intra_dec
|
||||
* bcast_intra_dec
|
||||
*
|
||||
* Function: - selects broadcast algorithm to use
|
||||
* Accepts: - same arguments as MPI_Bcast()
|
||||
@ -226,14 +223,14 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
/* Decision function based on MX results for
|
||||
/* Decision function based on MX results for
|
||||
messages up to 36MB and communicator sizes up to 64 nodes */
|
||||
const size_t small_message_size = 2048;
|
||||
const size_t intermediate_message_size = 370728;
|
||||
const double a_p16 = 3.2118e-6; /* [1 / byte] */
|
||||
const double b_p16 = 8.7936;
|
||||
const double b_p16 = 8.7936;
|
||||
const double a_p64 = 2.3679e-6; /* [1 / byte] */
|
||||
const double b_p64 = 1.1787;
|
||||
const double b_p64 = 1.1787;
|
||||
const double a_p128 = 1.6134e-6; /* [1 / byte] */
|
||||
const double b_p128 = 2.1102;
|
||||
|
||||
@ -251,95 +248,95 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
" root %d rank %d com_size %d msg_length %lu",
|
||||
root, ompi_comm_rank(comm), communicator_size, (unsigned long)message_size));
|
||||
|
||||
/* Handle messages of small and intermediate size, and
|
||||
/* Handle messages of small and intermediate size, and
|
||||
single-element broadcasts */
|
||||
if ((message_size < small_message_size) || (count <= 1)) {
|
||||
/* Binomial without segmentation */
|
||||
segsize = 0;
|
||||
return ompi_coll_tuned_bcast_intra_binomial (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_binomial(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (message_size < intermediate_message_size) {
|
||||
/* SplittedBinary with 1KB segments */
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
}
|
||||
}
|
||||
/* Handle large message sizes */
|
||||
else if (communicator_size < (a_p128 * message_size + b_p128)) {
|
||||
/* Pipeline with 128KB segments */
|
||||
segsize = 1024 << 7;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (communicator_size < 13) {
|
||||
/* Split Binary with 8KB segments */
|
||||
segsize = 1024 << 3;
|
||||
return ompi_coll_tuned_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
return ompi_coll_base_bcast_intra_split_bintree(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (communicator_size < (a_p64 * message_size + b_p64)) {
|
||||
/* Pipeline with 64KB segments */
|
||||
segsize = 1024 << 6;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
} else if (communicator_size < (a_p16 * message_size + b_p16)) {
|
||||
/* Pipeline with 16KB segments */
|
||||
segsize = 1024 << 4;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
|
||||
}
|
||||
|
||||
/* Pipeline with 8KB segments */
|
||||
segsize = 1024 << 3;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype,
|
||||
root, comm, module,
|
||||
segsize);
|
||||
#if 0
|
||||
/* this is based on gige measurements */
|
||||
|
||||
if (communicator_size < 4) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
|
||||
return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
|
||||
}
|
||||
if (communicator_size == 4) {
|
||||
if (message_size < 524288) segsize = 0;
|
||||
else segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
|
||||
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
|
||||
}
|
||||
if (communicator_size <= 8 && message_size < 4096) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm, module);
|
||||
return ompi_coll_base_bcast_intra_basic_linear(buff, count, datatype, root, comm, module);
|
||||
}
|
||||
if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
|
||||
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
|
||||
}
|
||||
if (message_size >= 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, module, segsize);
|
||||
return ompi_coll_base_bcast_intra_pipeline(buff, count, datatype, root, comm, module, segsize);
|
||||
}
|
||||
segsize = 0;
|
||||
/* once tested can swap this back in */
|
||||
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, module, segsize);
|
||||
/* return ompi_coll_base_bcast_intra_bmtree(buff, count, datatype, root, comm, segsize); */
|
||||
return ompi_coll_base_bcast_intra_bintree(buff, count, datatype, root, comm, module, segsize);
|
||||
#endif /* 0 */
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_intra_dec
|
||||
* reduce_intra_dec
|
||||
*
|
||||
* Function: - selects reduce algorithm to use
|
||||
* Accepts: - same arguments as MPI_Reduce()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the reduce implementation)
|
||||
*
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
@ -367,15 +364,15 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
message_size = dsize * (ptrdiff_t)count; /* needed for decision */
|
||||
|
||||
/**
|
||||
* If the operation is non commutative we currently have choice of linear
|
||||
* If the operation is non commutative we currently have choice of linear
|
||||
* or in-order binary tree algorithm.
|
||||
*/
|
||||
if( !ompi_op_is_commute(op) ) {
|
||||
if ((communicator_size < 12) && (message_size < 2048)) {
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
0, max_requests);
|
||||
return ompi_coll_base_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
}
|
||||
return ompi_coll_base_reduce_intra_in_order_binary (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
0, max_requests);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed "
|
||||
@ -384,27 +381,27 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
|
||||
if ((communicator_size < 8) && (message_size < 512)){
|
||||
/* Linear_0K */
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
} else if (((communicator_size < 8) && (message_size < 20480)) ||
|
||||
(message_size < 2048) || (count <= 1)) {
|
||||
/* Binomial_0K */
|
||||
segsize = 0;
|
||||
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} else if (communicator_size > (a1 * message_size + b1)) {
|
||||
/* Binomial_1K */
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
return ompi_coll_base_reduce_intra_binomial(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} else if (communicator_size > (a2 * message_size + b2)) {
|
||||
/* Pipeline_1K */
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
} else if (communicator_size > (a3 * message_size + b3)) {
|
||||
/* Binary_32K */
|
||||
segsize = 32*1024;
|
||||
return ompi_coll_tuned_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
|
||||
return ompi_coll_base_reduce_intra_binary( sendbuf, recvbuf, count, datatype, op, root,
|
||||
comm, module, segsize, max_requests);
|
||||
}
|
||||
if (communicator_size > (a4 * message_size + b4)) {
|
||||
@ -414,8 +411,8 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
/* Pipeline_64K */
|
||||
segsize = 64*1024;
|
||||
}
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
|
||||
#if 0
|
||||
/* for small messages use linear algorithm */
|
||||
@ -424,8 +421,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
fanout = communicator_size - 1;
|
||||
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
||||
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
||||
return ompi_coll_base_reduce_intra_basic_linear(sendbuf, recvbuf, count, datatype, op, root, comm, module);
|
||||
}
|
||||
if (message_size < 524288) {
|
||||
if (message_size <= 65536 ) {
|
||||
@ -437,21 +433,21 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
}
|
||||
/* later swap this for a binary tree */
|
||||
/* fanout = 2; */
|
||||
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, fanout, max_requests);
|
||||
return ompi_coll_base_reduce_intra_chain(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, fanout, max_requests);
|
||||
}
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
return ompi_coll_base_reduce_intra_pipeline(sendbuf, recvbuf, count, datatype, op, root, comm, module,
|
||||
segsize, max_requests);
|
||||
#endif /* 0 */
|
||||
}
|
||||
|
||||
/*
|
||||
* reduce_scatter_intra_dec
|
||||
* reduce_scatter_intra_dec
|
||||
*
|
||||
* Function: - selects reduce_scatter algorithm to use
|
||||
* Accepts: - same arguments as MPI_Reduce_scatter()
|
||||
* Returns: - MPI_SUCCESS or error code (passed from
|
||||
* Returns: - MPI_SUCCESS or error code (passed from
|
||||
* the reduce scatter implementation)
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
@ -474,16 +470,16 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
/* We need data size for decision function */
|
||||
ompi_datatype_type_size(dtype, &dsize);
|
||||
total_message_size = 0;
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
for (i = 0; i < comm_size; i++) {
|
||||
total_message_size += rcounts[i];
|
||||
}
|
||||
|
||||
if( !ompi_op_is_commute(op) ) {
|
||||
return ompi_coll_tuned_reduce_scatter_intra_nonoverlapping (sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
|
||||
total_message_size *= dsize;
|
||||
|
||||
/* compute the nearest power of 2 */
|
||||
@ -492,18 +488,18 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
if ((total_message_size <= small_message_size) ||
|
||||
((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
|
||||
(comm_size >= a * total_message_size + b)) {
|
||||
return
|
||||
ompi_coll_tuned_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
||||
return
|
||||
ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
|
||||
dtype, op,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* allgather_intra_dec
|
||||
* allgather_intra_dec
|
||||
*
|
||||
* Function: - selects allgather algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgather()
|
||||
@ -511,10 +507,10 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( void *sbuf, void *rbuf,
|
||||
* internal allgather function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
@ -525,78 +521,78 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
|
||||
/* Special case for 2 processes */
|
||||
if (communicator_size == 2) {
|
||||
return ompi_coll_tuned_allgather_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
/* Determine complete data size */
|
||||
ompi_datatype_type_size(sdtype, &dsize);
|
||||
total_dsize = dsize * (ptrdiff_t)scount * (ptrdiff_t)communicator_size;
|
||||
|
||||
total_dsize = dsize * (ptrdiff_t)scount * (ptrdiff_t)communicator_size;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allgather_intra_dec_fixed"
|
||||
" rank %d com_size %d msg_length %lu",
|
||||
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
||||
|
||||
pow2_size = opal_next_poweroftwo_inclusive (communicator_size);
|
||||
|
||||
/* Decision based on MX 2Gb results from Grig cluster at
|
||||
The University of Tennessee, Knoxville
|
||||
- if total message size is less than 50KB use either bruck or
|
||||
recursive doubling for non-power of two and power of two nodes,
|
||||
/* Decision based on MX 2Gb results from Grig cluster at
|
||||
The University of Tennessee, Knoxville
|
||||
- if total message size is less than 50KB use either bruck or
|
||||
recursive doubling for non-power of two and power of two nodes,
|
||||
respectively.
|
||||
- else use ring and neighbor exchange algorithms for odd and even
|
||||
- else use ring and neighbor exchange algorithms for odd and even
|
||||
number of nodes, respectively.
|
||||
*/
|
||||
if (total_dsize < 50000) {
|
||||
if (pow2_size == communicator_size) {
|
||||
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
} else {
|
||||
if (communicator_size % 2) {
|
||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
return ompi_coll_tuned_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if defined(USE_MPICH2_DECISION)
|
||||
/* Decision as in MPICH-2
|
||||
presented in Thakur et.al. "Optimization of Collective Communication
|
||||
Operations in MPICH", International Journal of High Performance Computing
|
||||
/* Decision as in MPICH-2
|
||||
presented in Thakur et.al. "Optimization of Collective Communication
|
||||
Operations in MPICH", International Journal of High Performance Computing
|
||||
Applications, Vol. 19, No. 1, 49-66 (2005)
|
||||
- for power-of-two processes and small and medium size messages
|
||||
- for power-of-two processes and small and medium size messages
|
||||
(up to 512KB) use recursive doubling
|
||||
- for non-power-of-two processes and small messages (80KB) use bruck,
|
||||
- for everything else use ring.
|
||||
*/
|
||||
if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
|
||||
return ompi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else if (total_dsize <= 81920) {
|
||||
return ompi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
} else if (total_dsize <= 81920) {
|
||||
return ompi_coll_base_allgather_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
return ompi_coll_base_allgather_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
comm, module);
|
||||
#endif /* defined(USE_MPICH2_DECISION) */
|
||||
}
|
||||
|
||||
/*
|
||||
* allgatherv_intra_dec
|
||||
* allgatherv_intra_dec
|
||||
*
|
||||
* Function: - selects allgatherv algorithm to use
|
||||
* Accepts: - same arguments as MPI_Allgatherv()
|
||||
@ -604,59 +600,59 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(void *sbuf, int scount,
|
||||
* internal allgatherv function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int *rcounts,
|
||||
void* rbuf, int *rcounts,
|
||||
int *rdispls,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
int i;
|
||||
int communicator_size;
|
||||
size_t dsize, total_dsize;
|
||||
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
|
||||
|
||||
/* Special case for 2 processes */
|
||||
if (communicator_size == 2) {
|
||||
return ompi_coll_tuned_allgatherv_intra_two_procs (sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
|
||||
|
||||
/* Determine complete data size */
|
||||
ompi_datatype_type_size(sdtype, &dsize);
|
||||
total_dsize = 0;
|
||||
for (i = 0; i < communicator_size; i++) {
|
||||
total_dsize += dsize * (ptrdiff_t)rcounts[i];
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_allgatherv_intra_dec_fixed"
|
||||
" rank %d com_size %d msg_length %lu",
|
||||
ompi_comm_rank(comm), communicator_size, (unsigned long)total_dsize));
|
||||
|
||||
|
||||
/* Decision based on allgather decision. */
|
||||
if (total_dsize < 50000) {
|
||||
return ompi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_bruck(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
if (communicator_size % 2) {
|
||||
return ompi_coll_tuned_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_ring(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
} else {
|
||||
return ompi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
return ompi_coll_base_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
|
||||
rbuf, rcounts, rdispls, rdtype,
|
||||
comm, module);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* gather_intra_dec
|
||||
* gather_intra_dec
|
||||
*
|
||||
* Function: - selects gather algorithm to use
|
||||
* Accepts: - same arguments as MPI_Gather()
|
||||
@ -664,10 +660,10 @@ int ompi_coll_tuned_allgatherv_intra_dec_fixed(void *sbuf, int scount,
|
||||
* internal gather function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
@ -685,7 +681,7 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
int communicator_size, rank;
|
||||
size_t dsize, block_size;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_gather_intra_dec_fixed"));
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
@ -701,33 +697,32 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
}
|
||||
|
||||
if (block_size > large_block_size) {
|
||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
large_segment_size);
|
||||
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
large_segment_size);
|
||||
|
||||
} else if (block_size > intermediate_block_size) {
|
||||
return ompi_coll_tuned_gather_intra_linear_sync (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
small_segment_size);
|
||||
return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module,
|
||||
small_segment_size);
|
||||
|
||||
} else if ((communicator_size > large_communicator_size) ||
|
||||
((communicator_size > small_communicator_size) &&
|
||||
(block_size < small_block_size))) {
|
||||
return ompi_coll_tuned_gather_intra_binomial (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
|
||||
return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
/* Otherwise, use basic linear */
|
||||
return ompi_coll_tuned_gather_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
return ompi_coll_base_gather_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
||||
/*
|
||||
* scatter_intra_dec
|
||||
* scatter_intra_dec
|
||||
*
|
||||
* Function: - selects scatter algorithm to use
|
||||
* Accepts: - same arguments as MPI_Scatter()
|
||||
@ -735,10 +730,10 @@ int ompi_coll_tuned_gather_intra_dec_fixed(void *sbuf, int scount,
|
||||
* internal allgather function.
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
int root, struct ompi_communicator_t *comm,
|
||||
mca_coll_base_module_t *module)
|
||||
{
|
||||
@ -747,7 +742,7 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
int communicator_size, rank;
|
||||
size_t dsize, block_size;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,
|
||||
"ompi_coll_tuned_scatter_intra_dec_fixed"));
|
||||
|
||||
communicator_size = ompi_comm_size(comm);
|
||||
@ -759,15 +754,15 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(void *sbuf, int scount,
|
||||
} else {
|
||||
ompi_datatype_type_size(rdtype, &dsize);
|
||||
block_size = dsize * (ptrdiff_t)rcount;
|
||||
}
|
||||
}
|
||||
|
||||
if ((communicator_size > small_comm_size) &&
|
||||
(block_size < small_block_size)) {
|
||||
return ompi_coll_tuned_scatter_intra_binomial (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
return ompi_coll_tuned_scatter_intra_basic_linear (sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
return ompi_coll_base_scatter_intra_basic_linear(sbuf, scount, sdtype,
|
||||
rbuf, rcount, rdtype,
|
||||
root, comm, module);
|
||||
}
|
||||
|
@ -1,21 +1,20 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -28,7 +27,7 @@
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
|
||||
/* also need the dynamic rule structures */
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
@ -43,7 +42,7 @@ static long getnext (FILE *fptr); /* local function */
|
||||
|
||||
static int fileline=0; /* used for verbose error messages */
|
||||
|
||||
/*
|
||||
/*
|
||||
* Reads a rule file called fname
|
||||
* Builds the algorithm rule table for a max of n_collectives
|
||||
*
|
||||
@ -151,7 +150,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
for (ncs=0;ncs<NCS;ncs++) { /* for each comm size */
|
||||
|
||||
com_p = &(alg_p->com_rules[ncs]);
|
||||
|
||||
|
||||
CS = (int)getnext (fptr);
|
||||
if (CS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
@ -165,7 +164,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n",
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n",
|
||||
NMS, CI, CS));
|
||||
com_p->n_msg_sizes = NMS;
|
||||
com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS);
|
||||
@ -222,7 +221,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t**
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI));
|
||||
|
||||
} /* per collective */
|
||||
|
||||
|
||||
fclose (fptr);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n"));
|
||||
@ -291,4 +290,3 @@ static long getnext (FILE *fptr)
|
||||
if ('#' == trash) skiptonewline (fptr);
|
||||
} while (1);
|
||||
}
|
||||
|
||||
|
@ -2,18 +2,18 @@
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 FUJITSU LIMITED. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -25,7 +25,7 @@
|
||||
#include "coll_tuned.h"
|
||||
|
||||
/* need to include our own topo prototypes so we can malloc data on the comm correctly */
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
|
||||
/* also need the dynamic rule structures */
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
@ -33,7 +33,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "coll_tuned_util.h"
|
||||
#include "ompi/mca/coll/base/coll_base_util.h"
|
||||
|
||||
|
||||
ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
|
||||
@ -43,7 +43,7 @@ ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
|
||||
|
||||
alg_rules = (ompi_coll_alg_rule_t *) calloc (n_alg, sizeof (ompi_coll_alg_rule_t));
|
||||
if (!alg_rules) return (alg_rules);
|
||||
|
||||
|
||||
/* set all we can at this point */
|
||||
for (i=0;i<n_alg;i++) {
|
||||
alg_rules[i].alg_rule_id = i;
|
||||
@ -52,7 +52,7 @@ ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
|
||||
}
|
||||
|
||||
|
||||
ompi_coll_com_rule_t* ompi_coll_tuned_mk_com_rules (int n_com_rules, int alg_rule_id)
|
||||
ompi_coll_com_rule_t* ompi_coll_tuned_mk_com_rules (int n_com_rules, int alg_rule_id)
|
||||
{
|
||||
int i;
|
||||
ompi_coll_com_rule_t * com_rules;
|
||||
@ -95,9 +95,9 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul
|
||||
|
||||
|
||||
/*
|
||||
* Debug / IO routines
|
||||
* Debug / IO routines
|
||||
*
|
||||
*/
|
||||
*/
|
||||
int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
|
||||
{
|
||||
if (!msg_p) {
|
||||
@ -105,11 +105,11 @@ int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
|
||||
return (-1);
|
||||
}
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\tcom_id %3d\tcom_size %3d\tmsg_id %3d\t", msg_p->alg_rule_id,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\tcom_id %3d\tcom_size %3d\tmsg_id %3d\t", msg_p->alg_rule_id,
|
||||
msg_p->com_rule_id, msg_p->mpi_comsize, msg_p->msg_rule_id));
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n",
|
||||
msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %10lu -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\tmax_requests %4d\n",
|
||||
msg_p->msg_size, msg_p->result_alg, msg_p->result_topo_faninout, msg_p->result_segsize,
|
||||
msg_p->result_max_requests));
|
||||
|
||||
return (0);
|
||||
@ -268,7 +268,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* query functions
|
||||
* i.e. the functions that get me the algorithm, topo fanin/out and segment size fast
|
||||
* and also get the rules that are needed by each communicator as needed
|
||||
@ -277,7 +277,7 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
|
||||
|
||||
/*
|
||||
* This function is used to get the pointer to the nearest (less than or equal)
|
||||
* com rule for this MPI collective (alg_id) for a given
|
||||
* com rule for this MPI collective (alg_id) for a given
|
||||
* MPI communicator size. The complete rule base must be presented.
|
||||
*
|
||||
* If no rule exists, returns NULL, else the com rule ptr
|
||||
@ -302,7 +302,7 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
}
|
||||
|
||||
/* ok have some com sizes, now to find the one closest to my mpi_comsize */
|
||||
|
||||
|
||||
/* make a copy of the first com rule */
|
||||
best_com_p = com_p = alg_p->com_rules;
|
||||
i = best = 0;
|
||||
@ -324,13 +324,13 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
return (best_com_p);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function takes a com_rule ptr (from the communicators coll tuned data structure)
|
||||
/*
|
||||
* This function takes a com_rule ptr (from the communicators coll tuned data structure)
|
||||
* (Which is chosen for a particular MPI collective)
|
||||
* and a (total_)msg_size and it returns (0) and a algorithm to use and a recommended topo faninout and segment size
|
||||
* all based on the user supplied rules
|
||||
*
|
||||
* Just like the above functions it uses a less than or equal msg size
|
||||
* Just like the above functions it uses a less than or equal msg size
|
||||
* (hence config file must have a default defined for '0' if we reach this point)
|
||||
* else if no rules match we return '0' + '0,0' or use fixed decision table with no topo change and no segmentation
|
||||
* of users data.. shame.
|
||||
@ -339,7 +339,7 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
*
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, size_t mpi_msgsize, int *result_topo_faninout,
|
||||
int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, size_t mpi_msgsize, int *result_topo_faninout,
|
||||
int* result_segsize, int* max_requests)
|
||||
{
|
||||
ompi_coll_msg_rule_t* msg_p = (ompi_coll_msg_rule_t*) NULL;
|
||||
@ -352,7 +352,7 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
|
||||
}
|
||||
|
||||
/* ok have some msg sizes, now to find the one closest to my mpi_msgsize */
|
||||
|
||||
|
||||
/* make a copy of the first msg rule */
|
||||
best_msg_p = msg_p = base_com_rule->msg_rules;
|
||||
i = best = 0;
|
||||
@ -387,6 +387,5 @@ int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rul
|
||||
*max_requests = best_msg_p->result_max_requests;
|
||||
|
||||
/* return the algorithm/method to use */
|
||||
return (best_msg_p->result_alg);
|
||||
return (best_msg_p->result_alg);
|
||||
}
|
||||
|
||||
|
@ -2,18 +2,18 @@
|
||||
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2009 The University of Tennessee and The University
|
||||
* Copyright (c) 2004-2015 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -26,13 +26,13 @@
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/mca/coll/coll.h"
|
||||
#include "ompi/mca/coll/base/base.h"
|
||||
#include "ompi/mca/coll/base/coll_base_topo.h"
|
||||
#include "coll_tuned.h"
|
||||
#include "coll_tuned_topo.h"
|
||||
#include "coll_tuned_dynamic_rules.h"
|
||||
#include "coll_tuned_dynamic_file.h"
|
||||
|
||||
static int tuned_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm);
|
||||
struct ompi_communicator_t *comm);
|
||||
/*
|
||||
* Initial query function that is invoked during MPI_INIT, allowing
|
||||
* this component to disqualify itself if it doesn't support the
|
||||
@ -79,8 +79,8 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
|
||||
*priority = ompi_coll_tuned_priority;
|
||||
|
||||
/*
|
||||
* Choose whether to use [intra|inter] decision functions
|
||||
/*
|
||||
* Choose whether to use [intra|inter] decision functions
|
||||
* and if using fixed OR dynamic rule sets.
|
||||
* Right now you cannot mix them, maybe later on it can be changed
|
||||
* but this would probably add an extra if and funct call to the path
|
||||
@ -114,9 +114,9 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority)
|
||||
|
||||
/* We put all routines that handle the MCA user forced algorithm and parameter choices here */
|
||||
/* recheck the setting of forced, called on module create (i.e. for each new comm) */
|
||||
|
||||
|
||||
static int
|
||||
ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
|
||||
ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
|
||||
coll_tuned_force_algorithm_params_t *forced_values )
|
||||
{
|
||||
coll_tuned_force_algorithm_mca_param_indices_t* mca_params;
|
||||
@ -145,20 +145,20 @@ ompi_coll_tuned_forced_getvalues( enum COLLTYPE type,
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
#define COLL_TUNED_EXECUTE_IF_DYNAMIC(DATA, TYPE, EXECUTE) \
|
||||
#define COLL_TUNED_EXECUTE_IF_DYNAMIC(TMOD, TYPE, EXECUTE) \
|
||||
{ \
|
||||
int need_dynamic_decision = 0; \
|
||||
ompi_coll_tuned_forced_getvalues( (TYPE), &((DATA)->user_forced[(TYPE)]) ); \
|
||||
(DATA)->com_rules[(TYPE)] = NULL; \
|
||||
if( 0 != (DATA)->user_forced[(TYPE)].algorithm ) { \
|
||||
ompi_coll_tuned_forced_getvalues( (TYPE), &((TMOD)->user_forced[(TYPE)]) ); \
|
||||
(TMOD)->com_rules[(TYPE)] = NULL; \
|
||||
if( 0 != (TMOD)->user_forced[(TYPE)].algorithm ) { \
|
||||
need_dynamic_decision = 1; \
|
||||
EXECUTE; \
|
||||
} \
|
||||
if( NULL != mca_coll_tuned_component.all_base_rules ) { \
|
||||
(DATA)->com_rules[(TYPE)] \
|
||||
(TMOD)->com_rules[(TYPE)] \
|
||||
= ompi_coll_tuned_get_com_rule_ptr( mca_coll_tuned_component.all_base_rules, \
|
||||
(TYPE), size ); \
|
||||
if( NULL != (DATA)->com_rules[(TYPE)] ) { \
|
||||
if( NULL != (TMOD)->com_rules[(TYPE)] ) { \
|
||||
need_dynamic_decision = 1; \
|
||||
} \
|
||||
} \
|
||||
@ -178,7 +178,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
{
|
||||
int size;
|
||||
mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t *) module;
|
||||
mca_coll_tuned_comm_t *data = NULL;
|
||||
mca_coll_base_comm_t *data = NULL;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));
|
||||
|
||||
@ -191,32 +191,27 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
|
||||
/**
|
||||
* we still malloc data as it is used by the TUNED modules
|
||||
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
||||
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
||||
* we place any special info after the default data
|
||||
*
|
||||
* BUT on very large systems we might not be able to allocate all this memory so
|
||||
* we do check a MCA parameter to see if we should allocate this memory
|
||||
*
|
||||
* The default is set very high
|
||||
*
|
||||
* The default is set very high
|
||||
*/
|
||||
|
||||
/* if we are within the memory/size limit, allow preallocated data */
|
||||
data = OBJ_NEW(mca_coll_base_comm_t);
|
||||
if (NULL == data) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if( size <= ompi_coll_tuned_preallocate_memory_comm_size_limit ) {
|
||||
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t) +
|
||||
(sizeof(ompi_request_t *) * size * 2));
|
||||
if (NULL == data) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) (data + 1);
|
||||
data->mcct_num_reqs = size * 2;
|
||||
} else {
|
||||
data = (mca_coll_tuned_comm_t*)malloc(sizeof(struct mca_coll_tuned_comm_t));
|
||||
if (NULL == data) {
|
||||
data->mcct_reqs = (ompi_request_t**)malloc(sizeof(ompi_request_t*) * data->mcct_num_reqs);
|
||||
if (NULL == data->mcct_reqs) {
|
||||
OBJ_RELEASE(data);
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
}
|
||||
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
@ -230,37 +225,37 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
* next dynamic state, recheck all forced rules as well
|
||||
* warning, we should check to make sure this is really an INTRA comm here...
|
||||
*/
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLGATHER,
|
||||
tuned_module->super.coll_allgather = ompi_coll_tuned_allgather_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLGATHERV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLGATHERV,
|
||||
tuned_module->super.coll_allgatherv = ompi_coll_tuned_allgatherv_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLREDUCE,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLREDUCE,
|
||||
tuned_module->super.coll_allreduce = ompi_coll_tuned_allreduce_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALL,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALL,
|
||||
tuned_module->super.coll_alltoall = ompi_coll_tuned_alltoall_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALLV,
|
||||
tuned_module->super.coll_alltoallv = ompi_coll_tuned_alltoallv_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, ALLTOALLW,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, ALLTOALLW,
|
||||
tuned_module->super.coll_alltoallw = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BARRIER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, BARRIER,
|
||||
tuned_module->super.coll_barrier = ompi_coll_tuned_barrier_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, BCAST,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, BCAST,
|
||||
tuned_module->super.coll_bcast = ompi_coll_tuned_bcast_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, EXSCAN,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, EXSCAN,
|
||||
tuned_module->super.coll_exscan = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, GATHER,
|
||||
tuned_module->super.coll_gather = ompi_coll_tuned_gather_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, GATHERV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, GATHERV,
|
||||
tuned_module->super.coll_gatherv = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCE,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCE,
|
||||
tuned_module->super.coll_reduce = ompi_coll_tuned_reduce_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, REDUCESCATTER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, REDUCESCATTER,
|
||||
tuned_module->super.coll_reduce_scatter = ompi_coll_tuned_reduce_scatter_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCAN,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCAN,
|
||||
tuned_module->super.coll_scan = NULL);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTER,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCATTER,
|
||||
tuned_module->super.coll_scatter = ompi_coll_tuned_scatter_intra_dec_dynamic);
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(data, SCATTERV,
|
||||
COLL_TUNED_EXECUTE_IF_DYNAMIC(tuned_module, SCATTERV,
|
||||
tuned_module->super.coll_scatterv = NULL);
|
||||
|
||||
if( false == ompi_coll_tuned_use_dynamic_rules ) {
|
||||
@ -269,7 +264,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
" decision by lack of dynamic rules"));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* general n fan out tree */
|
||||
data->cached_ntree = NULL;
|
||||
/* binary tree */
|
||||
@ -286,7 +281,7 @@ tuned_module_enable( mca_coll_base_module_t *module,
|
||||
data->cached_in_order_bintree = NULL;
|
||||
|
||||
/* All done */
|
||||
tuned_module->tuned_data = data;
|
||||
tuned_module->super.base_data = data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
|
||||
return OMPI_SUCCESS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user