Correct the bcast problem where we always did a bcast with a segsize of 0.
Activate the reduce decision function. Other small updates (mostly TAB to spaces). This commit was SVN r12161.
Этот коммит содержится в:
родитель
50649dd6a9
Коммит
be27ee6fa0
@ -42,9 +42,9 @@
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int err;
|
||||
int rank;
|
||||
@ -97,16 +97,15 @@ ompi_coll_tuned_allreduce_intra_nonoverlapping(void *sbuf, void *rbuf, int count
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int err;
|
||||
int rank;
|
||||
|
||||
rank = ompi_comm_rank(comm);
|
||||
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_basic_linear rank %d", rank));
|
||||
|
||||
/* Reduce to 0 and broadcast. */
|
||||
@ -144,63 +143,63 @@ int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorith
|
||||
int rc;
|
||||
int max_alg = 2;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = max_alg;
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_count",
|
||||
"Number of allreduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_count",
|
||||
"Number of allreduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm",
|
||||
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm",
|
||||
"Which allreduce algorithm is used. Can be locked down to any of: 0 ignore, 1 basic linear, 2 nonoverlapping (tuned reduce + tuned bcast)",
|
||||
false, false, 0, NULL);
|
||||
|
||||
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for allreduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for allreduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false, ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(
|
||||
&mca_coll_tuned_component.super.collm_version,
|
||||
"allreduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for allreduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_allreduce_intra_do_forced(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
|
||||
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
|
||||
comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm,
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
@ -208,25 +207,23 @@ switch (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
|
||||
|
||||
|
||||
int ompi_coll_tuned_allreduce_intra_do_this(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (1): return ompi_coll_tuned_allreduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, comm);
|
||||
case (2): return ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
@ -31,10 +31,10 @@
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int line = -1, err = 0;
|
||||
int rank, size, step;
|
||||
@ -54,37 +54,37 @@ int ompi_coll_tuned_alltoall_intra_pairwise(void *sbuf, int scount,
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
|
||||
/* Perform pairwise exchange - starting from 1 so the local copy is last */
|
||||
for (step = 1; step < size+1; step++) {
|
||||
/* Perform pairwise exchange - starting from 1 so the local copy is last */
|
||||
for (step = 1; step < size+1; step++) {
|
||||
|
||||
/* who do we talk to in this step? */
|
||||
sendto = (rank+step)%size;
|
||||
recvfrom = (rank+size-step)%size;
|
||||
/* who do we talk to in this step? */
|
||||
sendto = (rank+step)%size;
|
||||
recvfrom = (rank+size-step)%size;
|
||||
|
||||
/* where from are we sending and where from are we receiving actual data ? */
|
||||
tmpsend = (char*)sbuf+sendto*sext*scount;
|
||||
tmprecv = (char*)rbuf+recvfrom*rext*rcount;
|
||||
/* where from are we sending and where from are we receiving actual data ? */
|
||||
tmpsend = (char*)sbuf+sendto*sext*scount;
|
||||
tmprecv = (char*)rbuf+recvfrom*rext*rcount;
|
||||
|
||||
/* send and receive */
|
||||
err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
/* send and receive */
|
||||
err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
return MPI_SUCCESS;
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int i, k, line = -1;
|
||||
int rank, size;
|
||||
@ -145,107 +145,107 @@ int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount,
|
||||
err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) ((size-rank)*scount),
|
||||
tmpbuf, ((char*)sbuf)+rank*scount*sext);
|
||||
if (err<0) {
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
}
|
||||
|
||||
if (rank != 0) {
|
||||
err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) (rank*scount),
|
||||
tmpbuf+(size-rank)*scount*sext, (char*)sbuf);
|
||||
if (err<0) {
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
}
|
||||
}
|
||||
|
||||
/* perform communication step */
|
||||
for (distance = 1; distance < size; distance<<=1) {
|
||||
|
||||
/* send data to "sendto" */
|
||||
sendto = (rank+distance)%size;
|
||||
recvfrom = (rank-distance+size)%size;
|
||||
packsize = 0;
|
||||
k = 0;
|
||||
|
||||
/* create indexed datatype */
|
||||
for (i = 1; i < size; i++) {
|
||||
if ((i&distance) == distance) {
|
||||
displs[k] = i*scount; blen[k] = scount;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
/* Set indexes and displacements */
|
||||
err = MPI_Type_indexed(k, blen, displs, sdtype, &iddt);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
/* Commit the new datatype */
|
||||
err = MPI_Type_commit(&iddt);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* have the new distribution ddt, pack and exchange data */
|
||||
err = MPI_Pack(tmpbuf, 1, iddt, packbuf, maxpacksize, &packsize, comm);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv ( packbuf, packsize, MPI_PACKED, sendto,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
rbuf, packsize, MPI_PACKED, recvfrom,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Unpack data from rbuf to tmpbuf */
|
||||
position = 0;
|
||||
err = MPI_Unpack(rbuf, packsize, &position,
|
||||
tmpbuf, 1, iddt, comm);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* free ddt */
|
||||
err = MPI_Type_free(&iddt);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
} /* end of for (distance = 1... */
|
||||
|
||||
/* Step 3 - local rotation - */
|
||||
for (i = 0; i < size; i++) {
|
||||
|
||||
err = ompi_ddt_copy_content_same_ddt (rdtype, (int32_t) rcount,
|
||||
((char*)rbuf)+(((rank-i+size)%size)*rcount*rext),
|
||||
tmpbuf+i*rcount*rext);
|
||||
tmpbuf+(size-rank)*scount*sext, (char*)sbuf);
|
||||
if (err<0) {
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* perform communication step */
|
||||
for (distance = 1; distance < size; distance<<=1) {
|
||||
|
||||
/* send data to "sendto" */
|
||||
sendto = (rank+distance)%size;
|
||||
recvfrom = (rank-distance+size)%size;
|
||||
packsize = 0;
|
||||
k = 0;
|
||||
|
||||
/* create indexed datatype */
|
||||
for (i = 1; i < size; i++) {
|
||||
if ((i&distance) == distance) {
|
||||
displs[k] = i*scount; blen[k] = scount;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
/* Set indexes and displacements */
|
||||
err = MPI_Type_indexed(k, blen, displs, sdtype, &iddt);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
/* Commit the new datatype */
|
||||
err = MPI_Type_commit(&iddt);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* have the new distribution ddt, pack and exchange data */
|
||||
err = MPI_Pack(tmpbuf, 1, iddt, packbuf, maxpacksize, &packsize, comm);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Sendreceive */
|
||||
err = ompi_coll_tuned_sendrecv ( packbuf, packsize, MPI_PACKED, sendto,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
rbuf, packsize, MPI_PACKED, recvfrom,
|
||||
MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* Unpack data from rbuf to tmpbuf */
|
||||
position = 0;
|
||||
err = MPI_Unpack(rbuf, packsize, &position,
|
||||
tmpbuf, 1, iddt, comm);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* free ddt */
|
||||
err = MPI_Type_free(&iddt);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
} /* end of for (distance = 1... */
|
||||
|
||||
/* Step 3 - local rotation - */
|
||||
for (i = 0; i < size; i++) {
|
||||
|
||||
err = ompi_ddt_copy_content_same_ddt (rdtype, (int32_t) rcount,
|
||||
((char*)rbuf)+(((rank-i+size)%size)*rcount*rext),
|
||||
tmpbuf+i*rcount*rext);
|
||||
if (err<0) {
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (err<0) {
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
line = __LINE__; err = -1; goto err_hndl;
|
||||
}
|
||||
|
||||
/* Step 4 - clean up */
|
||||
if (tmpbuf != NULL) free(tmpbuf);
|
||||
if (packbuf != NULL) free(packbuf);
|
||||
if (weallocated) {
|
||||
if (displs != NULL) free(displs);
|
||||
if (blen != NULL) free(blen);
|
||||
}
|
||||
/* Step 4 - clean up */
|
||||
if (tmpbuf != NULL) free(tmpbuf);
|
||||
if (packbuf != NULL) free(packbuf);
|
||||
if (weallocated) {
|
||||
if (displs != NULL) free(displs);
|
||||
if (blen != NULL) free(blen);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
if (tmpbuf != NULL) free(tmpbuf);
|
||||
if (packbuf != NULL) free(packbuf);
|
||||
if (weallocated) {
|
||||
if (displs != NULL) free(displs);
|
||||
if (blen != NULL) free(blen);
|
||||
}
|
||||
return err;
|
||||
if (tmpbuf != NULL) free(tmpbuf);
|
||||
if (packbuf != NULL) free(packbuf);
|
||||
if (weallocated) {
|
||||
if (displs != NULL) free(displs);
|
||||
if (blen != NULL) free(blen);
|
||||
}
|
||||
return err;
|
||||
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int line = -1, err = 0;
|
||||
int rank;
|
||||
@ -273,8 +273,8 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
/* send and receive */
|
||||
err = ompi_coll_tuned_sendrecv ( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank );
|
||||
tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
|
||||
comm, MPI_STATUS_IGNORE, rank );
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
|
||||
/* ddt sendrecv your own data */
|
||||
@ -287,7 +287,7 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
@ -311,10 +311,10 @@ int ompi_coll_tuned_alltoall_intra_two_procs(void *sbuf, int scount,
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_basic_linear(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int i;
|
||||
int rank;
|
||||
@ -443,51 +443,51 @@ int ompi_coll_tuned_alltoall_intra_check_forced_init (coll_tuned_force_algorithm
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[ALLTOALL] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_count",
|
||||
"Number of alltoall algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_count",
|
||||
"Number of alltoall algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm",
|
||||
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm",
|
||||
"Which alltoall algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 pairwise, 3: modified bruck, 4: two proc only.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for alltoall algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for alltoall algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_chain_fanout",
|
||||
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"alltoall_algorithm_chain_fanout",
|
||||
"Fanout for chains used for alltoall algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_do_forced(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm));
|
||||
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
@ -495,7 +495,7 @@ switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
|
||||
comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
@ -503,16 +503,16 @@ switch (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_do_this(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (1): return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
case (2): return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
@ -520,7 +520,7 @@ switch (algorithm) {
|
||||
case (4): return ompi_coll_tuned_alltoall_intra_two_procs (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[ALLTOALL]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
|
@ -65,7 +65,7 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm)
|
||||
|
||||
if (rank > 0) { /* receive message from the left */
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
@ -77,14 +77,14 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm)
|
||||
/* root needs to receive from the last node */
|
||||
if (rank == 0) {
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
/* Allow nodes to exit */
|
||||
if (rank > 0) { /* post Receive from left */
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
@ -96,15 +96,15 @@ int ompi_coll_tuned_barrier_intra_doublering(struct ompi_communicator_t *comm)
|
||||
/* rank 0 post receive from the last node */
|
||||
if (rank == 0) {
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
|
||||
}
|
||||
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
@ -131,13 +131,13 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
if (rank >= adjsize) {
|
||||
/* send message to lower ranked node */
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, rank-adjsize,
|
||||
MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
|
||||
/* post receive from lower ranked node */
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank-adjsize,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
|
||||
@ -145,7 +145,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
|
||||
/* receive message from high level rank */
|
||||
err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
MCA_COLL_BASE_TAG_BARRIER, comm, MPI_STATUS_IGNORE));
|
||||
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
}
|
||||
@ -160,8 +160,8 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
if (remote >= adjsize) continue;
|
||||
|
||||
err = ompi_coll_tuned_sendrecv_localcompleted (NULL, 0, MPI_BYTE, remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
NULL, 0, MPI_BYTE, remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
NULL, 0, MPI_BYTE, remote, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
}
|
||||
@ -173,7 +173,7 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
|
||||
/* send enter message to higher ranked node */
|
||||
err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, rank+adjsize,
|
||||
MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
|
||||
MCA_COLL_BASE_TAG_BARRIER, MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
|
||||
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
}
|
||||
@ -181,9 +181,9 @@ int ompi_coll_tuned_barrier_intra_recursivedoubling(struct ompi_communicator_t *
|
||||
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
@ -206,16 +206,16 @@ int ompi_coll_tuned_barrier_intra_bruck(struct ompi_communicator_t *comm)
|
||||
from = (rank + size - distance)%size;
|
||||
to = (rank + distance)%size;
|
||||
err = ompi_coll_tuned_sendrecv_localcompleted (NULL, 0, MPI_BYTE, to, MCA_COLL_BASE_TAG_BARRIER,
|
||||
NULL, 0, MPI_BYTE, from, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
NULL, 0, MPI_BYTE, from, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
|
||||
}
|
||||
|
||||
return MPI_SUCCESS;
|
||||
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
err_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
@ -233,13 +233,13 @@ int ompi_coll_tuned_barrier_intra_two_procs(struct ompi_communicator_t *comm)
|
||||
|
||||
if (0==rank) {
|
||||
err = ompi_coll_tuned_sendrecv_localcompleted (NULL, 0, MPI_BYTE, 1, MCA_COLL_BASE_TAG_BARRIER,
|
||||
NULL, 0, MPI_BYTE, 1, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
NULL, 0, MPI_BYTE, 1, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
}
|
||||
else {
|
||||
err = ompi_coll_tuned_sendrecv_localcompleted (NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
|
||||
NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
NULL, 0, MPI_BYTE, 0, MCA_COLL_BASE_TAG_BARRIER,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
}
|
||||
|
||||
return (err);
|
||||
@ -334,39 +334,39 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_
|
||||
int rc;
|
||||
int max_alg = 5;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[BARRIER] = max_alg;
|
||||
ompi_coll_tuned_forced_max_algorithms[BARRIER] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm_count",
|
||||
"Number of barrier algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm_count",
|
||||
"Number of barrier algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm",
|
||||
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"barrier_algorithm",
|
||||
"Which barrier algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 double ring, 3: recursive doubling 4: bruck, 5: two proc only",
|
||||
false, false, 0, NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_do_forced(struct ompi_communicator_t *comm)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[BARRIER].algorithm));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[BARRIER].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
|
||||
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
|
||||
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
|
||||
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm);
|
||||
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm);
|
||||
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm);
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
comm->c_coll_selected_data->user_forced[BARRIER].algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
comm->c_coll_selected_data->user_forced[BARRIER].algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
@ -375,19 +375,19 @@ switch (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
|
||||
|
||||
int ompi_coll_tuned_barrier_intra_do_this (struct ompi_communicator_t *comm, int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this selected algorithm %d topo fanin/out%d", algorithm, faninout));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
|
||||
case (1): return ompi_coll_tuned_barrier_intra_basic_linear (comm);
|
||||
case (2): return ompi_coll_tuned_barrier_intra_doublering (comm);
|
||||
case (3): return ompi_coll_tuned_barrier_intra_recursivedoubling (comm);
|
||||
case (4): return ompi_coll_tuned_barrier_intra_bruck (comm);
|
||||
case (5): return ompi_coll_tuned_barrier_intra_two_procs (comm);
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
/* case (6): return ompi_coll_tuned_barrier_intra_bmtree_step (comm); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:barrier_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BARRIER]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
|
@ -31,10 +31,10 @@
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_chain ( void *buff, int count,
|
||||
struct ompi_datatype_t *datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
uint32_t segsize, int32_t chains )
|
||||
struct ompi_datatype_t *datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
uint32_t segsize, int32_t chains )
|
||||
{
|
||||
int err = 0, line, rank, size, segindex, i;
|
||||
int segcount; /* Number of elements sent with each segment */
|
||||
@ -111,7 +111,7 @@ ompi_coll_tuned_bcast_intra_chain ( void *buff, int count,
|
||||
/* set the buffer pointer */
|
||||
tmpbuf = (char *)buff;
|
||||
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,("%1d chain root %d num_segments %d\n", rank, root, num_segments); */
|
||||
/* OPAL_OUTPUT((ompi_coll_tuned_stream,("%1d chain root %d num_segments %d\n", rank, root, num_segments); */
|
||||
|
||||
/* root code */
|
||||
if( rank == root ) {
|
||||
@ -141,8 +141,8 @@ ompi_coll_tuned_bcast_intra_chain ( void *buff, int count,
|
||||
*/
|
||||
new_sendcount = sendcount = segcount;
|
||||
err = MCA_PML_CALL(irecv( tmpbuf, sendcount, datatype,
|
||||
chain->chain_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, &base_req));
|
||||
chain->chain_prev, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, &base_req));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
|
||||
for( segindex = 1; segindex < num_segments; segindex++ ) {
|
||||
@ -212,29 +212,29 @@ ompi_coll_tuned_bcast_intra_chain ( void *buff, int count,
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_pipeline ( void *buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t *datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
uint32_t segsize )
|
||||
int count,
|
||||
struct ompi_datatype_t *datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
uint32_t segsize )
|
||||
{
|
||||
int rank; /* remove when removing print statement */
|
||||
rank = ompi_comm_rank(comm); /* remove when removing print statement */
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_pipeline rank %d root %d ss %5d", rank, root, segsize));
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype, root, comm,
|
||||
segsize, 1 );
|
||||
segsize, 1 );
|
||||
}
|
||||
|
||||
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
uint32_t segsize )
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
uint32_t segsize )
|
||||
{
|
||||
int err=0, line;
|
||||
int rank, size;
|
||||
@ -307,7 +307,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
(segsize > counts[1] * type_size) ) {
|
||||
/* call linear version here ! */
|
||||
return (ompi_coll_tuned_bcast_intra_chain ( buffer, count, datatype,
|
||||
root, comm, segsize, 1 ));
|
||||
root, comm, segsize, 1 ));
|
||||
}
|
||||
|
||||
err = ompi_ddt_get_extent (datatype, &lb, &type_extent);
|
||||
@ -349,7 +349,7 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
/* update tmp buffer */
|
||||
/* update tmp buffer */
|
||||
tmpbuf[i] += realsegsize[i];
|
||||
}
|
||||
}
|
||||
@ -448,10 +448,10 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
if ( (size%2) != 0 && rank != root) {
|
||||
|
||||
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
} else if ( (size%2) == 0 ) {
|
||||
/* root sends right buffer to the last node */
|
||||
@ -472,17 +472,17 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
/* everyone else exchanges buffers */
|
||||
else {
|
||||
err = ompi_coll_tuned_sendrecv( tmpbuf[lr], counts[lr], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
tmpbuf[(lr+1)%2], counts[(lr+1)%2], datatype,
|
||||
pair, MCA_COLL_BASE_TAG_BCAST,
|
||||
comm, MPI_STATUS_IGNORE, rank);
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
}
|
||||
return (MPI_SUCCESS);
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
@ -491,11 +491,11 @@ ompi_coll_tuned_bcast_intra_split_bintree ( void* buffer,
|
||||
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
uint32_t segsize )
|
||||
int count,
|
||||
struct ompi_datatype_t* datatype,
|
||||
int root,
|
||||
struct ompi_communicator_t* comm,
|
||||
uint32_t segsize )
|
||||
{
|
||||
int err=0, line, i;
|
||||
int rank, size;
|
||||
@ -588,8 +588,8 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
|
||||
/* send data */
|
||||
MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
|
||||
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i]));
|
||||
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i]));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
@ -639,8 +639,8 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
|
||||
/* send data */
|
||||
MCA_PML_CALL(isend(tmpbuf, segcount, datatype,
|
||||
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i]));
|
||||
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i]));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
@ -661,8 +661,8 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
|
||||
for( i = 0; i < tree->tree_nextsize; i++ ) { /* send data to children */
|
||||
MCA_PML_CALL(isend(tmpbuf, sendcount, datatype,
|
||||
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i]));
|
||||
tree->tree_next[i], MCA_COLL_BASE_TAG_BCAST,
|
||||
MCA_PML_BASE_SEND_STANDARD, comm, &send_reqs[i]));
|
||||
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
@ -692,7 +692,7 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
return (MPI_SUCCESS);
|
||||
|
||||
error_hndl:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
@ -720,8 +720,8 @@ ompi_coll_tuned_bcast_intra_bintree ( void* buffer,
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int i;
|
||||
int size;
|
||||
@ -735,7 +735,6 @@ ompi_coll_tuned_bcast_intra_basic_linear (void *buff, int count,
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_basic_linear rank %d root %d", rank, root));
|
||||
|
||||
|
||||
/* Non-root receive the data. */
|
||||
|
||||
if (rank != root) {
|
||||
@ -800,67 +799,67 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc
|
||||
int rc;
|
||||
int max_alg = 5;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[BCAST] = max_alg;
|
||||
ompi_coll_tuned_forced_max_algorithms[BCAST] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_count",
|
||||
"Number of bcast algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_count",
|
||||
"Number of bcast algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm",
|
||||
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree.",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm",
|
||||
"Which bcast algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 chain, 3: pipeline, 4: split binary tree, 5: binary tree.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for bcast algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_chain_fanout",
|
||||
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"bcast_algorithm_chain_fanout",
|
||||
"Fanout for chains used for bcast algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_do_forced(void *buf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[BCAST].algorithm));
|
||||
comm->c_coll_selected_data->user_forced[BCAST].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
|
||||
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
|
||||
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm,
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize,
|
||||
comm->c_coll_selected_data->user_forced[BCAST].chain_fanout );
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize,
|
||||
comm->c_coll_selected_data->user_forced[BCAST].chain_fanout );
|
||||
case (3): return ompi_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm,
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize);
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize);
|
||||
case (4): return ompi_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm,
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize);
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize);
|
||||
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm,
|
||||
comm->c_coll_selected_data->user_forced[BCAST].segsize);
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
* ompi_coll_tuned_bcast_forced_segsize); */
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
* ompi_coll_tuned_bcast_forced_segsize); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
comm->c_coll_selected_data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
comm->c_coll_selected_data->user_forced[BCAST].algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
@ -868,27 +867,27 @@ switch (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
|
||||
|
||||
int ompi_coll_tuned_bcast_intra_do_this(void *buf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
struct ompi_datatype_t *dtype,
|
||||
int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_bcast_intra_dec_fixed (buf, count, dtype, root, comm);
|
||||
case (1): return ompi_coll_tuned_bcast_intra_basic_linear (buf, count, dtype, root, comm);
|
||||
case (2): return ompi_coll_tuned_bcast_intra_chain (buf, count, dtype, root, comm, segsize, faninout );
|
||||
case (3): return ompi_coll_tuned_bcast_intra_pipeline (buf, count, dtype, root, comm, segsize);
|
||||
case (4): return ompi_coll_tuned_bcast_intra_split_bintree (buf, count, dtype, root, comm, segsize);
|
||||
case (5): return ompi_coll_tuned_bcast_intra_bintree (buf, count, dtype, root, comm, segsize);
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
* segsize); */
|
||||
/* case (6): return ompi_coll_tuned_bcast_intra_bmtree (buf, count, dtype, root, comm,
|
||||
* segsize); */
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:bcast_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[BCAST]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
|
@ -72,48 +72,48 @@ mca_coll_tuned_component_t mca_coll_tuned_component = {
|
||||
|
||||
{
|
||||
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
/* First, the mca_component_t struct containing meta information
|
||||
about the component itself */
|
||||
|
||||
{
|
||||
/* Indicate that we are a coll v1.0.0 component (which also implies a
|
||||
specific MCA version) */
|
||||
{
|
||||
/* Indicate that we are a coll v1.0.0 component (which also implies a
|
||||
specific MCA version) */
|
||||
|
||||
MCA_COLL_BASE_VERSION_1_0_0,
|
||||
MCA_COLL_BASE_VERSION_1_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
/* Component name and version */
|
||||
|
||||
"tuned",
|
||||
OMPI_MAJOR_VERSION,
|
||||
OMPI_MINOR_VERSION,
|
||||
OMPI_RELEASE_VERSION,
|
||||
"tuned",
|
||||
OMPI_MAJOR_VERSION,
|
||||
OMPI_MINOR_VERSION,
|
||||
OMPI_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
/* Component open and close functions */
|
||||
|
||||
tuned_open,
|
||||
tuned_close
|
||||
},
|
||||
tuned_open,
|
||||
tuned_close
|
||||
},
|
||||
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
/* Next the MCA v1.0.0 component meta data */
|
||||
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
{
|
||||
/* Whether the component is checkpointable or not */
|
||||
|
||||
true
|
||||
},
|
||||
true
|
||||
},
|
||||
|
||||
/* Initialization / querying functions */
|
||||
/* Initialization / querying functions */
|
||||
|
||||
ompi_coll_tuned_init_query,
|
||||
ompi_coll_tuned_comm_query,
|
||||
NULL
|
||||
},
|
||||
ompi_coll_tuned_init_query,
|
||||
ompi_coll_tuned_comm_query,
|
||||
NULL
|
||||
},
|
||||
|
||||
/* priority of the module */
|
||||
0,
|
||||
|
||||
/* Tuned component specific information */
|
||||
/* Note some of this WAS in the module */
|
||||
/* Tuned component specific information */
|
||||
/* Note some of this WAS in the module */
|
||||
NULL /* ompi_coll_alg_rule_t ptr */
|
||||
};
|
||||
|
||||
@ -122,7 +122,7 @@ static int tuned_open(void)
|
||||
{
|
||||
int param;
|
||||
|
||||
/* ompi_coll_tuned_component_t *ct = &ompi_coll_tuned_component; */
|
||||
/* ompi_coll_tuned_component_t *ct = &ompi_coll_tuned_component; */
|
||||
|
||||
/* Use a low priority, but allow other components to be lower */
|
||||
|
||||
@ -149,13 +149,13 @@ static int tuned_open(void)
|
||||
|
||||
/* if dynamic rules allowed then look up dynamic rules config filename, else we leave it an empty filename (NULL) */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
/* char *default_name; */
|
||||
/* asprintf(&default_name, "~/.openmpi/openmpi-coll-tuned-params.conf"); */
|
||||
/* char *default_name; */
|
||||
/* asprintf(&default_name, "~/.openmpi/openmpi-coll-tuned-params.conf"); */
|
||||
mca_base_param_reg_string(&mca_coll_tuned_component.super.collm_version,
|
||||
"dynamic_rules_filename",
|
||||
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
|
||||
false, false, ompi_coll_tuned_dynamic_rules_filename,
|
||||
&ompi_coll_tuned_dynamic_rules_filename);
|
||||
"dynamic_rules_filename",
|
||||
"Filename of configuration file that contains the dynamic (@runtime) decision function rules",
|
||||
false, false, ompi_coll_tuned_dynamic_rules_filename,
|
||||
&ompi_coll_tuned_dynamic_rules_filename);
|
||||
}
|
||||
|
||||
/* some initial guesses at topology parameters */
|
||||
@ -176,7 +176,7 @@ static int tuned_open(void)
|
||||
int verbose;
|
||||
mca_base_param_lookup_int(param, &verbose);
|
||||
if (verbose > 0) {
|
||||
ompi_coll_tuned_stream = opal_output_open(NULL);
|
||||
ompi_coll_tuned_stream = opal_output_open(NULL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -190,7 +190,7 @@ static int tuned_open(void)
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]);
|
||||
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]);
|
||||
/* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
|
||||
/* ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALLV]); */
|
||||
ompi_coll_tuned_barrier_intra_check_forced_init(&ompi_coll_tuned_forced_params[BARRIER]);
|
||||
ompi_coll_tuned_bcast_intra_check_forced_init(&ompi_coll_tuned_forced_params[BCAST]);
|
||||
ompi_coll_tuned_reduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[REDUCE]);
|
||||
|
@ -54,9 +54,9 @@
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_dynamic"));
|
||||
@ -64,7 +64,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
/* check to see if we have some filebased rules */
|
||||
if (comm->c_coll_selected_data->com_rules[ALLREDUCE]) {
|
||||
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize;
|
||||
size_t dsize;
|
||||
|
||||
@ -72,20 +72,18 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (comm->c_coll_selected_data->com_rules[ALLREDUCE],
|
||||
dsize, &faninout, &segsize);
|
||||
dsize, &faninout, &segsize);
|
||||
|
||||
if (alg) { /* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op, comm,
|
||||
alg, faninout, segsize);
|
||||
alg, faninout, segsize);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (comm->c_coll_selected_data->user_forced[ALLREDUCE].algorithm) {
|
||||
return ompi_coll_tuned_allreduce_intra_do_forced (sbuf, rbuf, count, dtype, op, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
|
||||
}
|
||||
return ompi_coll_tuned_allreduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, comm);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -97,10 +95,10 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (void *sbuf, void *rbuf, int count,
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_dynamic"));
|
||||
@ -108,7 +106,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
/* check to see if we have some filebased rules */
|
||||
if (comm->c_coll_selected_data->com_rules[ALLTOALL]) {
|
||||
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int comsize;
|
||||
int alg, faninout, segsize;
|
||||
size_t dsize;
|
||||
@ -118,11 +116,11 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
dsize *= comsize * scount;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (comm->c_coll_selected_data->com_rules[ALLTOALL],
|
||||
dsize, &faninout, &segsize);
|
||||
dsize, &faninout, &segsize);
|
||||
|
||||
if (alg) { /* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_alltoall_intra_do_this (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm,
|
||||
alg, faninout, segsize);
|
||||
alg, faninout, segsize);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
@ -130,9 +128,7 @@ int ompi_coll_tuned_alltoall_intra_dec_dynamic(void *sbuf, int scount,
|
||||
if (comm->c_coll_selected_data->user_forced[ALLTOALL].algorithm) {
|
||||
return ompi_coll_tuned_alltoall_intra_do_forced (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
return ompi_coll_tuned_alltoall_intra_dec_fixed (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -150,25 +146,22 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm)
|
||||
/* check to see if we have some filebased rules */
|
||||
if (comm->c_coll_selected_data->com_rules[BARRIER]) {
|
||||
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (comm->c_coll_selected_data->com_rules[BARRIER],
|
||||
0, &faninout, &segsize);
|
||||
0, &faninout, &segsize);
|
||||
|
||||
if (alg) { /* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_barrier_intra_do_this (comm,
|
||||
alg, faninout, segsize);
|
||||
alg, faninout, segsize);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (comm->c_coll_selected_data->user_forced[BARRIER].algorithm) {
|
||||
return ompi_coll_tuned_barrier_intra_do_forced (comm);
|
||||
return ompi_coll_tuned_barrier_intra_do_forced (comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_barrier_intra_dec_fixed (comm);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -179,8 +172,8 @@ int ompi_coll_tuned_barrier_intra_dec_dynamic(struct ompi_communicator_t *comm)
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the bcast implementation)
|
||||
*/
|
||||
int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:bcast_intra_dec_dynamic"));
|
||||
@ -188,7 +181,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
/* check to see if we have some filebased rules */
|
||||
if (comm->c_coll_selected_data->com_rules[BCAST]) {
|
||||
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize;
|
||||
size_t dsize;
|
||||
|
||||
@ -196,7 +189,7 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (comm->c_coll_selected_data->com_rules[BCAST],
|
||||
dsize, &faninout, &segsize);
|
||||
dsize, &faninout, &segsize);
|
||||
|
||||
if (alg) { /* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_bcast_intra_do_this (buff, count, datatype, root, comm,
|
||||
@ -206,12 +199,9 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
|
||||
|
||||
if (comm->c_coll_selected_data->user_forced[BCAST].algorithm) {
|
||||
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root, comm);
|
||||
return ompi_coll_tuned_bcast_intra_do_forced (buff, count, datatype, root, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_bcast_intra_dec_fixed (buff, count, datatype, root, comm);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_bcast_intra_dec_fixed (buff, count, datatype, root, comm);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -223,9 +213,9 @@ int ompi_coll_tuned_bcast_intra_dec_dynamic(void *buff, int count,
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
struct ompi_op_t* op, int root,
|
||||
struct ompi_communicator_t* comm)
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
struct ompi_op_t* op, int root,
|
||||
struct ompi_communicator_t* comm)
|
||||
{
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_intra_dec_dynamic"));
|
||||
@ -233,7 +223,7 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
/* check to see if we have some filebased rules */
|
||||
if (comm->c_coll_selected_data->com_rules[REDUCE]) {
|
||||
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
/* we do, so calc the message size or what ever we need and use this for the evaluation */
|
||||
int alg, faninout, segsize;
|
||||
size_t dsize;
|
||||
|
||||
@ -241,20 +231,17 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( void *sendbuf, void *recvbuf,
|
||||
dsize *= count;
|
||||
|
||||
alg = ompi_coll_tuned_get_target_method_params (comm->c_coll_selected_data->com_rules[REDUCE],
|
||||
dsize, &faninout, &segsize);
|
||||
dsize, &faninout, &segsize);
|
||||
|
||||
if (alg) { /* we have found a valid choice from the file based rules for this message size */
|
||||
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype, op, root, comm,
|
||||
alg, faninout, segsize);
|
||||
alg, faninout, segsize);
|
||||
} /* found a method */
|
||||
} /*end if any com rules to check */
|
||||
|
||||
if (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
|
||||
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
return ompi_coll_tuned_reduce_intra_do_forced (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_reduce_intra_dec_fixed (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
}
|
||||
|
||||
return ompi_coll_tuned_reduce_intra_dec_fixed (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
}
|
||||
|
||||
|
@ -37,21 +37,13 @@
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
/* int size; */
|
||||
/* int contig; */
|
||||
/* int dsize; */
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_allreduce_intra_dec_fixed"));
|
||||
|
||||
/* size = ompi_comm_size(comm); */
|
||||
|
||||
return (ompi_coll_tuned_allreduce_intra_nonoverlapping (sbuf, rbuf, count, dtype, op, comm));
|
||||
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
@ -63,16 +55,13 @@ ompi_coll_tuned_allreduce_intra_dec_fixed (void *sbuf, void *rbuf, int count,
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *sdtype,
|
||||
void* rbuf, int rcount,
|
||||
struct ompi_datatype_t *rdtype,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int comsize;
|
||||
int rank;
|
||||
int err;
|
||||
unsigned long dsize;
|
||||
unsigned long total_dsize;
|
||||
int comsize, rank, err;
|
||||
size_t dsize, total_dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_alltoall_intra_dec_fixed"));
|
||||
|
||||
@ -87,21 +76,19 @@ int ompi_coll_tuned_alltoall_intra_dec_fixed(void *sbuf, int scount,
|
||||
/* else we need data size for decision function */
|
||||
err = ompi_ddt_get_size (sdtype, &dsize);
|
||||
if (err != MPI_SUCCESS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
total_dsize = dsize * scount * (unsigned long)comsize; /* needed for decision */
|
||||
total_dsize = dsize * scount * comsize; /* needed for decision */
|
||||
|
||||
if (comsize >= 12 && total_dsize <= 768) {
|
||||
return ompi_coll_tuned_alltoall_intra_bruck (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
else if (total_dsize <= 131072) {
|
||||
if (total_dsize <= 131072) {
|
||||
return ompi_coll_tuned_alltoall_intra_basic_linear (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
return ompi_coll_tuned_alltoall_intra_pairwise (sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
|
||||
}
|
||||
|
||||
|
||||
@ -122,11 +109,10 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
|
||||
|
||||
if (2==comsize)
|
||||
return ompi_coll_tuned_barrier_intra_two_procs(comm);
|
||||
else
|
||||
/* return ompi_coll_tuned_barrier_intra_doublering(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_doublering(comm); */
|
||||
return ompi_coll_tuned_barrier_intra_recursivedoubling(comm);
|
||||
/* return ompi_coll_tuned_barrier_intra_bruck(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_linear(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_bruck(comm); */
|
||||
/* return ompi_coll_tuned_barrier_intra_linear(comm); */
|
||||
|
||||
}
|
||||
|
||||
@ -139,16 +125,12 @@ int ompi_coll_tuned_barrier_intra_dec_fixed(struct ompi_communicator_t *comm)
|
||||
* Returns: - MPI_SUCCESS or error code (passed from the bcast implementation)
|
||||
*/
|
||||
int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *datatype, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
int comsize;
|
||||
int rank;
|
||||
int err;
|
||||
unsigned long msgsize;
|
||||
unsigned long dsize;
|
||||
int comsize, rank, err;
|
||||
int segsize = 0;
|
||||
|
||||
size_t msgsize, dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"ompi_coll_tuned_bcast_intra_dec_fixed"));
|
||||
|
||||
@ -158,7 +140,7 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
/* else we need data size for decision function */
|
||||
err = ompi_ddt_get_size (datatype, &dsize);
|
||||
if (err != MPI_SUCCESS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
@ -166,34 +148,29 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
|
||||
/* this is based on gige measurements */
|
||||
|
||||
if ((comsize < 4)) {
|
||||
segsize = 0;
|
||||
if (comsize < 4) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||
}
|
||||
else if (comsize == 4) {
|
||||
if (msgsize < 524288) segsize = 0;
|
||||
else msgsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
if (comsize == 4) {
|
||||
if (msgsize < 524288) segsize = 0;
|
||||
else segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
else if (comsize > 4 && comsize <= 8 && msgsize < 4096) {
|
||||
segsize = 0;
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||
if (comsize <= 8 && msgsize < 4096) {
|
||||
return ompi_coll_tuned_bcast_intra_basic_linear (buff, count, datatype, root, comm);
|
||||
}
|
||||
else if (comsize > 8 && msgsize >= 32768 && msgsize < 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
if (comsize > 8 && msgsize >= 32768 && msgsize < 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
else if (comsize > 4 && msgsize >= 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize);
|
||||
if (msgsize >= 524288) {
|
||||
segsize = 16384;
|
||||
return ompi_coll_tuned_bcast_intra_pipeline (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
else {
|
||||
segsize = 0;
|
||||
/* once tested can swap this back in */
|
||||
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
|
||||
segsize = 0;
|
||||
/* once tested can swap this back in */
|
||||
/* return ompi_coll_tuned_bcast_intra_bmtree (buff, count, datatype, root, comm, segsize); */
|
||||
return ompi_coll_tuned_bcast_intra_bintree (buff, count, datatype, root, comm, segsize);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -205,19 +182,12 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count,
|
||||
*
|
||||
*/
|
||||
int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
struct ompi_op_t* op, int root,
|
||||
struct ompi_communicator_t* comm)
|
||||
int count, struct ompi_datatype_t* datatype,
|
||||
struct ompi_op_t* op, int root,
|
||||
struct ompi_communicator_t* comm)
|
||||
{
|
||||
int comsize;
|
||||
int rank;
|
||||
int err;
|
||||
/* int contig; */
|
||||
unsigned long msgsize;
|
||||
unsigned long dsize;
|
||||
int segsize = 0;
|
||||
/* int fanout = 0; */
|
||||
|
||||
int comsize, rank, err, segsize = 0, fanout = 0;
|
||||
size_t msgsize, dsize;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_reduce_intra_dec_fixed"));
|
||||
|
||||
@ -227,39 +197,33 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( void *sendbuf, void *recvbuf,
|
||||
/* need data size for decision function */
|
||||
err = ompi_ddt_get_size (datatype, &dsize);
|
||||
if (err != MPI_SUCCESS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,__LINE__,err,rank));
|
||||
return (err);
|
||||
}
|
||||
|
||||
msgsize = dsize * (unsigned long)count; /* needed for decision */
|
||||
msgsize = dsize * count; /* needed for decision */
|
||||
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
#ifdef coconuts
|
||||
/* for small messages use linear algorithm */
|
||||
if (msgsize <= 4096) {
|
||||
/* for small messages use linear algorithm */
|
||||
if (msgsize <= 4096) {
|
||||
segsize = 0;
|
||||
fanout = size-1;
|
||||
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
||||
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
||||
fanout = comsize - 1;
|
||||
/* when linear implemented or taken from basic put here, right now using chain as a linear system */
|
||||
/* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
|
||||
return ompi_coll_tuned_reduce_intra_basic_linear (sendbuf, recvbuf, count, datatype, op, root, comm);
|
||||
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
||||
} else if (msgsize <= 65536 ) {
|
||||
segsize = 32768;
|
||||
fanout = 8;
|
||||
/* return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
|
||||
}
|
||||
if (msgsize < 524288) {
|
||||
if (msgsize <= 65536 ) {
|
||||
segsize = 32768;
|
||||
fanout = 8;
|
||||
} else {
|
||||
segsize = 1024;
|
||||
fanout = comsize/2;
|
||||
}
|
||||
/* later swap this for a binary tree */
|
||||
/* fanout = 2; */
|
||||
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout);
|
||||
} else if (msgsize < 524288) {
|
||||
segsize = 1024;
|
||||
fanout = size/2;
|
||||
/* later swap this for a binary tree */
|
||||
/* fanout = 2; */
|
||||
return ompi_coll_tuned_reduce_intra_chain (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||
}
|
||||
|
||||
}
|
||||
segsize = 1024;
|
||||
return ompi_coll_tuned_reduce_intra_pipeline (sendbuf, recvbuf, count, datatype, op, root, comm, segsize);
|
||||
}
|
||||
|
||||
|
||||
|
@ -60,235 +60,235 @@ static int fileline=0; /* used for verbose error messages */
|
||||
|
||||
int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives)
|
||||
{
|
||||
FILE *fptr = (FILE*) NULL;
|
||||
int X;
|
||||
int CI;
|
||||
int NCS;
|
||||
int CS;
|
||||
int NMS;
|
||||
int MS, ALG, FANINOUT, SS;
|
||||
int x, ncs, nms;
|
||||
FILE *fptr = (FILE*) NULL;
|
||||
int X;
|
||||
int CI;
|
||||
int NCS;
|
||||
int CS;
|
||||
int NMS;
|
||||
int MS, ALG, FANINOUT, SS;
|
||||
int x, ncs, nms;
|
||||
|
||||
ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */
|
||||
ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */
|
||||
|
||||
/* individual pointers to sections of rules */
|
||||
ompi_coll_alg_rule_t *alg_p = (ompi_coll_alg_rule_t*) NULL;
|
||||
ompi_coll_com_rule_t *com_p = (ompi_coll_com_rule_t*) NULL;
|
||||
ompi_coll_msg_rule_t *msg_p = (ompi_coll_msg_rule_t*) NULL;
|
||||
/* individual pointers to sections of rules */
|
||||
ompi_coll_alg_rule_t *alg_p = (ompi_coll_alg_rule_t*) NULL;
|
||||
ompi_coll_com_rule_t *com_p = (ompi_coll_com_rule_t*) NULL;
|
||||
ompi_coll_msg_rule_t *msg_p = (ompi_coll_msg_rule_t*) NULL;
|
||||
|
||||
/* stats info */
|
||||
int total_alg_count = 0;
|
||||
int total_com_count = 0;
|
||||
int total_msg_count = 0;
|
||||
/* stats info */
|
||||
int total_alg_count = 0;
|
||||
int total_com_count = 0;
|
||||
int total_msg_count = 0;
|
||||
|
||||
if (!fname) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table configuration file for tuned collectives... ignoring!\n"));
|
||||
return (-1);
|
||||
}
|
||||
if (!fname) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table configuration file for tuned collectives... ignoring!\n"));
|
||||
return (-1);
|
||||
}
|
||||
|
||||
if (!rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table result ptr!... ignoring!\n"));
|
||||
return (-2);
|
||||
}
|
||||
if (!rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave NULL as rule table result ptr!... ignoring!\n"));
|
||||
return (-2);
|
||||
}
|
||||
|
||||
if (n_collectives<1) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave %d as max number of collectives in the rule table configuration file for tuned collectives!... ignoring!\n", n_collectives));
|
||||
return (-3);
|
||||
}
|
||||
if (n_collectives<1) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Gave %d as max number of collectives in the rule table configuration file for tuned collectives!... ignoring!\n", n_collectives));
|
||||
return (-3);
|
||||
}
|
||||
|
||||
fptr = fopen (fname, "r");
|
||||
if (!fptr) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"cannot read rules file [%s]\n", fname));
|
||||
goto on_file_error;
|
||||
}
|
||||
fptr = fopen (fname, "r");
|
||||
if (!fptr) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"cannot read rules file [%s]\n", fname));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
/* make space and init the algorithm rules for each of the n_collectives MPI collectives */
|
||||
alg_rules = ompi_coll_tuned_mk_alg_rules (n_collectives);
|
||||
/* make space and init the algorithm rules for each of the n_collectives MPI collectives */
|
||||
alg_rules = ompi_coll_tuned_mk_alg_rules (n_collectives);
|
||||
|
||||
X = getnext(fptr);
|
||||
if (X<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
if (X>n_collectives) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
X = getnext(fptr);
|
||||
if (X<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
if (X>n_collectives) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
for (x=0;x<X;x++) { /* for each collective */
|
||||
for (x=0;x<X;x++) { /* for each collective */
|
||||
|
||||
CI = getnext (fptr);
|
||||
if (CI<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read next Collective id in configuration file around line %d\n", fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
if (CI>=n_collectives) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
CI = getnext (fptr);
|
||||
if (CI<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read next Collective id in configuration file around line %d\n", fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
if (CI>=n_collectives) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
if (alg_rules[CI].alg_rule_id != CI) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI));
|
||||
ompi_coll_tuned_free_all_rules (*rules, n_collectives);
|
||||
return (-4);
|
||||
}
|
||||
if (alg_rules[CI].alg_rule_id != CI) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI));
|
||||
ompi_coll_tuned_free_all_rules (*rules, n_collectives);
|
||||
return (-4);
|
||||
}
|
||||
|
||||
alg_p = &alg_rules[CI];
|
||||
alg_p = &alg_rules[CI];
|
||||
|
||||
alg_p->alg_rule_id = CI;
|
||||
alg_p->n_com_sizes = 0;
|
||||
alg_p->com_rules = (ompi_coll_com_rule_t *) NULL;
|
||||
alg_p->alg_rule_id = CI;
|
||||
alg_p->n_com_sizes = 0;
|
||||
alg_p->com_rules = (ompi_coll_com_rule_t *) NULL;
|
||||
|
||||
NCS = getnext (fptr);
|
||||
if (NCS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
NCS = getnext (fptr);
|
||||
if (NCS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
alg_p->n_com_sizes = NCS;
|
||||
alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI);
|
||||
alg_p->n_com_sizes = NCS;
|
||||
alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI);
|
||||
|
||||
for (ncs=0;ncs<NCS;ncs++) { /* for each comm size */
|
||||
for (ncs=0;ncs<NCS;ncs++) { /* for each comm size */
|
||||
|
||||
com_p = &(alg_p->com_rules[ncs]);
|
||||
com_p = &(alg_p->com_rules[ncs]);
|
||||
|
||||
CS = getnext (fptr);
|
||||
if (CS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
com_p->mpi_comsize = CS;
|
||||
|
||||
NMS = getnext (fptr);
|
||||
if (NMS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
com_p->n_msg_sizes = NMS;
|
||||
com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS);
|
||||
|
||||
msg_p = com_p->msg_rules;
|
||||
|
||||
for (nms=0;nms<NMS;nms++) { /* for each msg size */
|
||||
|
||||
msg_p = &(com_p->msg_rules[nms]);
|
||||
|
||||
MS = getnext (fptr);
|
||||
if (MS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
CS = getnext (fptr);
|
||||
if (CS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->msg_size = MS;
|
||||
|
||||
ALG = getnext (fptr);
|
||||
if (ALG<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
com_p->mpi_comsize = CS;
|
||||
|
||||
NMS = getnext (fptr);
|
||||
if (NMS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->result_alg = ALG;
|
||||
|
||||
FANINOUT = getnext (fptr);
|
||||
if (FANINOUT<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->result_topo_faninout = FANINOUT;
|
||||
com_p->n_msg_sizes = NMS;
|
||||
com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS);
|
||||
|
||||
SS = getnext (fptr);
|
||||
if (SS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->result_segsize = SS;
|
||||
msg_p = com_p->msg_rules;
|
||||
|
||||
if (!nms && MS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %d for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
for (nms=0;nms<NMS;nms++) { /* for each msg size */
|
||||
|
||||
total_msg_count++;
|
||||
msg_p = &(com_p->msg_rules[nms]);
|
||||
|
||||
} /* msg size */
|
||||
MS = getnext (fptr);
|
||||
if (MS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->msg_size = MS;
|
||||
|
||||
total_com_count++;
|
||||
ALG = getnext (fptr);
|
||||
if (ALG<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->result_alg = ALG;
|
||||
|
||||
} /* comm size */
|
||||
FANINOUT = getnext (fptr);
|
||||
if (FANINOUT<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->result_topo_faninout = FANINOUT;
|
||||
|
||||
total_alg_count++;
|
||||
SS = getnext (fptr);
|
||||
if (SS<0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
msg_p->result_segsize = SS;
|
||||
|
||||
} /* per collective */
|
||||
if (!nms && MS) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %d for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline));
|
||||
goto on_file_error;
|
||||
}
|
||||
|
||||
total_msg_count++;
|
||||
|
||||
} /* msg size */
|
||||
|
||||
total_com_count++;
|
||||
|
||||
} /* comm size */
|
||||
|
||||
total_alg_count++;
|
||||
|
||||
} /* per collective */
|
||||
|
||||
fclose (fptr);
|
||||
fclose (fptr);
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Collectives with rules\t\t\t: %5d\n", total_alg_count));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Communicator sizes with rules\t\t: %5d\n", total_com_count));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Message sizes with rules\t\t: %5d\n", total_msg_count));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Lines in configuration file read\t\t: %5d\n", fileline));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Collectives with rules\t\t\t: %5d\n", total_alg_count));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Communicator sizes with rules\t\t: %5d\n", total_com_count));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Message sizes with rules\t\t: %5d\n", total_msg_count));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Lines in configuration file read\t\t: %5d\n", fileline));
|
||||
|
||||
/* return the rules to the caller */
|
||||
*rules = alg_rules;
|
||||
/* return the rules to the caller */
|
||||
*rules = alg_rules;
|
||||
|
||||
return (total_alg_count);
|
||||
return (total_alg_count);
|
||||
|
||||
|
||||
on_file_error:
|
||||
on_file_error:
|
||||
|
||||
/* here we close out the file and delete any memory allocated nicely */
|
||||
/* we return back a verbose message and a count of -1 algorithms read */
|
||||
/* draconian but its better than having a bad collective decision table */
|
||||
/* here we close out the file and delete any memory allocated nicely */
|
||||
/* we return back a verbose message and a count of -1 algorithms read */
|
||||
/* draconian but its better than having a bad collective decision table */
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"read_rules_config_file: bad configure file [%s]. Read afar as line %d\n", fname, fileline));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Ignoring user supplied tuned collectives configuration decision file.\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Switching back to [compiled in] fixed decision table.\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Fix errors as listed above and try again.\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"read_rules_config_file: bad configure file [%s]. Read afar as line %d\n", fname, fileline));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Ignoring user supplied tuned collectives configuration decision file.\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Switching back to [compiled in] fixed decision table.\n"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"Fix errors as listed above and try again.\n"));
|
||||
|
||||
/* deallocate memory if allocated */
|
||||
if (alg_rules) ompi_coll_tuned_free_all_rules (alg_rules, n_collectives);
|
||||
/* deallocate memory if allocated */
|
||||
if (alg_rules) ompi_coll_tuned_free_all_rules (alg_rules, n_collectives);
|
||||
|
||||
/* close file */
|
||||
if (fptr) fclose (fptr);
|
||||
/* close file */
|
||||
if (fptr) fclose (fptr);
|
||||
|
||||
*rules = (ompi_coll_alg_rule_t*) NULL;
|
||||
return (-1);
|
||||
*rules = (ompi_coll_alg_rule_t*) NULL;
|
||||
return (-1);
|
||||
}
|
||||
|
||||
|
||||
static int getnext (FILE *fptr)
|
||||
{
|
||||
int val;
|
||||
int rc;
|
||||
char trash;
|
||||
int val;
|
||||
int rc;
|
||||
char trash;
|
||||
|
||||
do {
|
||||
rc = fscanf(fptr, "%d", &val);
|
||||
if (rc==EOF) return (MYEOF);
|
||||
if (1==rc) return (val);
|
||||
else {
|
||||
rc = fread(&trash, 1, 1, fptr);
|
||||
if ('\n'==trash) fileline++;
|
||||
if ('#'==trash) skiptonewline (fptr);
|
||||
}
|
||||
} while (1);
|
||||
do {
|
||||
rc = fscanf(fptr, "%d", &val);
|
||||
if (rc==EOF) return (MYEOF);
|
||||
if (1==rc) return (val);
|
||||
else {
|
||||
rc = fread(&trash, 1, 1, fptr);
|
||||
if ('\n'==trash) fileline++;
|
||||
if ('#'==trash) skiptonewline (fptr);
|
||||
}
|
||||
} while (1);
|
||||
|
||||
return rc;
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void skiptonewline (FILE *fptr)
|
||||
{
|
||||
char val;
|
||||
int rc;
|
||||
char val;
|
||||
int rc;
|
||||
|
||||
do {
|
||||
rc = fread(&val, 1, 1, fptr);
|
||||
if (0==rc) return;
|
||||
if ((1==rc)&&('\n'==val)) {
|
||||
fileline++;
|
||||
return;
|
||||
}
|
||||
} while (1);
|
||||
do {
|
||||
rc = fread(&val, 1, 1, fptr);
|
||||
if (0==rc) return;
|
||||
if ((1==rc)&&('\n'==val)) {
|
||||
fileline++;
|
||||
return;
|
||||
}
|
||||
} while (1);
|
||||
}
|
||||
|
||||
|
@ -41,36 +41,36 @@
|
||||
|
||||
ompi_coll_alg_rule_t* ompi_coll_tuned_mk_alg_rules (int n_alg)
|
||||
{
|
||||
int i;
|
||||
ompi_coll_alg_rule_t* alg_rules;
|
||||
int i;
|
||||
ompi_coll_alg_rule_t* alg_rules;
|
||||
|
||||
alg_rules = (ompi_coll_alg_rule_t *) calloc (n_alg, sizeof (ompi_coll_alg_rule_t));
|
||||
if (!alg_rules) return (alg_rules);
|
||||
alg_rules = (ompi_coll_alg_rule_t *) calloc (n_alg, sizeof (ompi_coll_alg_rule_t));
|
||||
if (!alg_rules) return (alg_rules);
|
||||
|
||||
/* set all we can at this point */
|
||||
for (i=0;i<n_alg;i++) {
|
||||
alg_rules[i].alg_rule_id = i;
|
||||
}
|
||||
return (alg_rules);
|
||||
/* set all we can at this point */
|
||||
for (i=0;i<n_alg;i++) {
|
||||
alg_rules[i].alg_rule_id = i;
|
||||
}
|
||||
return (alg_rules);
|
||||
}
|
||||
|
||||
|
||||
ompi_coll_com_rule_t* ompi_coll_tuned_mk_com_rules (int n_com_rules, int alg_rule_id)
|
||||
{
|
||||
int i;
|
||||
ompi_coll_com_rule_t * com_rules;
|
||||
int i;
|
||||
ompi_coll_com_rule_t * com_rules;
|
||||
|
||||
com_rules = (ompi_coll_com_rule_t *) calloc (n_com_rules, sizeof (ompi_coll_com_rule_t));
|
||||
if (!com_rules) return (com_rules);
|
||||
com_rules = (ompi_coll_com_rule_t *) calloc (n_com_rules, sizeof (ompi_coll_com_rule_t));
|
||||
if (!com_rules) return (com_rules);
|
||||
|
||||
for (i=0;i<n_com_rules;i++) {
|
||||
com_rules[i].mpi_comsize = 0; /* unknown */
|
||||
com_rules[i].alg_rule_id = alg_rule_id;
|
||||
com_rules[i].com_rule_id = i;
|
||||
com_rules[i].n_msg_sizes = 0; /* unknown */
|
||||
com_rules[i].msg_rules = (ompi_coll_msg_rule_t *) NULL;
|
||||
}
|
||||
return (com_rules);
|
||||
for (i=0;i<n_com_rules;i++) {
|
||||
com_rules[i].mpi_comsize = 0; /* unknown */
|
||||
com_rules[i].alg_rule_id = alg_rule_id;
|
||||
com_rules[i].com_rule_id = i;
|
||||
com_rules[i].n_msg_sizes = 0; /* unknown */
|
||||
com_rules[i].msg_rules = (ompi_coll_msg_rule_t *) NULL;
|
||||
}
|
||||
return (com_rules);
|
||||
}
|
||||
|
||||
|
||||
@ -83,14 +83,14 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul
|
||||
if (!msg_rules) return (msg_rules);
|
||||
|
||||
for (i=0;i<n_msg_rules;i++) {
|
||||
msg_rules[i].mpi_comsize = mpi_comsize;
|
||||
msg_rules[i].alg_rule_id = alg_rule_id;
|
||||
msg_rules[i].com_rule_id = com_rule_id;
|
||||
msg_rules[i].msg_rule_id = i;
|
||||
msg_rules[i].msg_size = 0; /* unknown */
|
||||
msg_rules[i].result_alg = 0; /* unknown */
|
||||
msg_rules[i].result_topo_faninout = 0; /* unknown */
|
||||
msg_rules[i].result_segsize = 0; /* unknown */
|
||||
msg_rules[i].mpi_comsize = mpi_comsize;
|
||||
msg_rules[i].alg_rule_id = alg_rule_id;
|
||||
msg_rules[i].com_rule_id = com_rule_id;
|
||||
msg_rules[i].msg_rule_id = i;
|
||||
msg_rules[i].msg_size = 0; /* unknown */
|
||||
msg_rules[i].result_alg = 0; /* unknown */
|
||||
msg_rules[i].result_topo_faninout = 0; /* unknown */
|
||||
msg_rules[i].result_segsize = 0; /* unknown */
|
||||
}
|
||||
return (msg_rules);
|
||||
}
|
||||
@ -104,89 +104,89 @@ ompi_coll_msg_rule_t* ompi_coll_tuned_mk_msg_rules (int n_msg_rules, int alg_rul
|
||||
|
||||
/*
 * Write one message rule to the tuned output stream: first its identifying
 * ids, then the message size and the algorithm / fan-in-out / segment size
 * it selects.  Returns -1 for a NULL rule, 0 otherwise.
 */
int ompi_coll_tuned_dump_msg_rule (ompi_coll_msg_rule_t* msg_p)
{
    if (NULL == msg_p) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"Message rule was a NULL ptr?!\n"));
        return (-1);
    }

    /* where the rule sits in the rule tree */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\tcom_id %3d\tcom_size %3d\tmsg_id %3d\t",
                 msg_p->alg_rule_id, msg_p->com_rule_id,
                 msg_p->mpi_comsize, msg_p->msg_rule_id));

    /* what the rule decides */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"msg_size %6d -> algorithm %2d\ttopo in/out %2d\tsegsize %5ld\n",
                 msg_p->msg_size, msg_p->result_alg,
                 msg_p->result_topo_faninout, msg_p->result_segsize));

    return (0);
}
|
||||
|
||||
|
||||
/*
 * Write one communicator rule to the tuned output stream, followed by every
 * message rule attached to it.  Returns -1 for a NULL rule, 0 otherwise.
 */
int ompi_coll_tuned_dump_com_rule (ompi_coll_com_rule_t* com_p)
{
    int msg_idx;

    if (NULL == com_p) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"Com rule was a NULL ptr?!\n"));
        return (-1);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream, "alg_id %3d\tcom_id %3d\tcom_size %3d\t",
                 com_p->alg_rule_id, com_p->com_rule_id, com_p->mpi_comsize));

    /* nothing below this level to print */
    if (0 == com_p->n_msg_sizes) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"no msgsizes defined\n"));
        return (0);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"number of message sizes %3d\n", com_p->n_msg_sizes));

    msg_idx = 0;
    while (msg_idx < com_p->n_msg_sizes) {
        ompi_coll_tuned_dump_msg_rule (com_p->msg_rules + msg_idx);
        msg_idx++;
    }

    return (0);
}
|
||||
|
||||
|
||||
/*
 * Write one algorithm rule to the tuned output stream, followed by every
 * communicator rule attached to it.  Returns -1 for a NULL rule, 0 otherwise.
 */
int ompi_coll_tuned_dump_alg_rule (ompi_coll_alg_rule_t* alg_p)
{
    int com_idx;

    if (NULL == alg_p) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n"));
        return (-1);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"alg_id %3d\t", alg_p->alg_rule_id));

    /* nothing below this level to print */
    if (0 == alg_p->n_com_sizes) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"no coms defined\n"));
        return (0);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"number of com sizes %3d\n", alg_p->n_com_sizes));

    com_idx = 0;
    while (com_idx < alg_p->n_com_sizes) {
        ompi_coll_tuned_dump_com_rule (alg_p->com_rules + com_idx);
        com_idx++;
    }

    return (0);
}
|
||||
|
||||
|
||||
/*
 * Write the first n_rules entries of an algorithm-rule array to the tuned
 * output stream.  Returns -1 when the array pointer is NULL, 0 otherwise.
 */
int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p, int n_rules)
{
    int alg_idx;

    if (NULL == alg_p) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"Algorithm rule was a NULL ptr?!\n"));
        return (-1);
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of algorithm rules %3d\n", n_rules));

    alg_idx = 0;
    while (alg_idx < n_rules) {
        ompi_coll_tuned_dump_alg_rule (alg_p + alg_idx);
        alg_idx++;
    }

    return (0);
}
|
||||
|
||||
|
||||
@ -198,82 +198,81 @@ int ompi_coll_tuned_dump_all_rules (ompi_coll_alg_rule_t* alg_p, int n_rules)
|
||||
|
||||
/*
 * Release the message-rule array owned by a communicator rule and clear the
 * pointer.  Returns 0 on success or when the rule has no message sizes, and
 * -1 when com_p is NULL or when a non-zero message count is paired with a
 * NULL array.
 */
int ompi_coll_tuned_free_msg_rules_in_com_rule (ompi_coll_com_rule_t* com_p)
{
    if (NULL == com_p) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL com_rule ptr\n"));
        return (-1);
    }

    /* no message rules recorded, nothing to release */
    if (0 == com_p->n_msg_sizes) {
        return (0);
    }

    /* the count claims rules exist but the array is missing */
    if (NULL == com_p->msg_rules) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL msg_rules when msg count was %d\n", com_p->n_msg_sizes));
        return (-1);
    }

    free (com_p->msg_rules);
    com_p->msg_rules = (ompi_coll_msg_rule_t*) NULL;

    return (0);
}
|
||||
|
||||
|
||||
|
||||
/*
 * Release every communicator rule owned by an algorithm rule: first the
 * message rules hanging off each communicator rule, then the communicator
 * rule array itself, whose pointer is cleared afterwards.
 *
 * Returns 0 on success or when the rule has no communicator sizes.
 * Returns -1 when alg_p is NULL, when a non-zero communicator count is paired
 * with a NULL array, or when freeing the message rules of any entry failed.
 * The array is still released in that last case.
 */
int ompi_coll_tuned_free_coms_in_alg_rule (ompi_coll_alg_rule_t* alg_p)
{
    int rc=0;
    int i;

    ompi_coll_com_rule_t* com_p;

    if (!alg_p) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL alg_rule ptr\n"));
        return (-1);
    }

    if (alg_p->n_com_sizes) {
        com_p = alg_p->com_rules;

        if (!com_p) {
            OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL com_rules when com count was %d\n", alg_p->n_com_sizes));
            rc = -1; /* count and pointer disagree, same policy as the msg rule free */
        }
        else {
            /* ok, memory exists for the com rules so free their message rules first */
            for (i=0;i<alg_p->n_com_sizes;i++) {
                com_p = &(alg_p->com_rules[i]);
                if (ompi_coll_tuned_free_msg_rules_in_com_rule (com_p)) {
                    rc = -1; /* remember the failure but keep releasing the rest */
                }
            }
            /* we are now free to free the com rules themselves */
            free (alg_p->com_rules);
            alg_p->com_rules = (ompi_coll_com_rule_t*) NULL;
        }

    } /* if we have com rules to free as well */

    return (rc);
}
|
||||
|
||||
|
||||
/*
 * Release a whole rule base: the communicator rules of each of the n_algs
 * algorithm rules, then the algorithm-rule array itself.
 *
 * Returns the sum of the per-algorithm return codes (0 when everything was
 * released cleanly).  A NULL alg_p returns -1 when n_algs is positive and 0
 * otherwise; in neither case is any entry touched.
 */
int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
{
    int i;
    int rc = 0;

    if (!alg_p) {
        /* indexing a NULL array would hand bogus non-NULL addresses to the
         * per-algorithm free for every i > 0, so stop here */
        if (n_algs <= 0) return (0);
        OPAL_OUTPUT((ompi_coll_tuned_stream,"attempt to free NULL alg_rule ptr\n"));
        return (-1);
    }

    for(i=0;i<n_algs;i++) {
        rc += ompi_coll_tuned_free_coms_in_alg_rule (&(alg_p[i]));
    }

    free (alg_p);

    return (rc);
}
|
||||
|
||||
|
||||
@ -296,48 +295,48 @@ int ompi_coll_tuned_free_all_rules (ompi_coll_alg_rule_t* alg_p, int n_algs)
|
||||
*/
|
||||
/*
 * Pick the communicator rule to use for a communicator of mpi_comsize ranks
 * under algorithm rule alg_id.
 *
 * The communicator rules are walked in array order; the walk stops at the
 * first entry whose mpi_comsize exceeds the request and the last entry
 * accepted before that is returned.  If even the first entry is too large it
 * is returned anyway.  Returns NULL when there is no rule base or the
 * algorithm rule has no communicator sizes.
 *
 * NOTE(review): alg_id is used as an index without a range check, and the
 * early stop presumably relies on the entries being in ascending size order;
 * confirm both against the callers and the rule-file reader.
 */
ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* rules, int alg_id, int mpi_comsize)
{
    ompi_coll_alg_rule_t *alg_rule;
    ompi_coll_com_rule_t *candidate;
    ompi_coll_com_rule_t *chosen;
    int idx;

    /* no rule base, no resulting com rule */
    if (NULL == rules) {
        return ((ompi_coll_com_rule_t*)NULL);
    }

    alg_rule = rules + alg_id;

    /* no com sizes, so no rule */
    if (0 == alg_rule->n_com_sizes) {
        return ((ompi_coll_com_rule_t*)NULL);
    }

    /* start from the first entry and advance while entries still fit */
    chosen = alg_rule->com_rules;
    for (idx = 0, candidate = chosen; idx < alg_rule->n_com_sizes; idx++, candidate++) {
        if (candidate->mpi_comsize > mpi_comsize) {
            break;
        }
        chosen = candidate;
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"Selected the following com rule id %d\n", chosen->com_rule_id));
    ompi_coll_tuned_dump_com_rule (chosen);

    return (chosen);
}
|
||||
|
||||
/*
|
||||
@ -356,61 +355,61 @@ ompi_coll_com_rule_t* ompi_coll_tuned_get_com_rule_ptr (ompi_coll_alg_rule_t* ru
|
||||
*/
|
||||
|
||||
/*
 * Pick the message rule to use for a message of mpi_msgsize under the given
 * communicator rule and hand back what it selects.
 *
 * The message rules are walked in array order; the walk stops at the first
 * entry whose msg_size exceeds the request and the last entry accepted before
 * that is used.  If even the first entry is too large it is used anyway.
 *
 * On a match, *result_topo_faninout and *result_segsize receive the rule's
 * fan-in/out and segment size and the rule's algorithm is returned.
 * Returns 0, leaving both outputs untouched, when any pointer argument is
 * NULL or the communicator rule has no message sizes.
 *
 * NOTE(review): the early stop presumably relies on the entries being in
 * ascending msg_size order; confirm against the rule-file reader.
 */
int ompi_coll_tuned_get_target_method_params (ompi_coll_com_rule_t* base_com_rule, int mpi_msgsize, int *result_topo_faninout,
                                              int* result_segsize)
{
    ompi_coll_msg_rule_t *candidate;
    ompi_coll_msg_rule_t *chosen;
    int idx;

    if ((NULL == base_com_rule) || (NULL == result_topo_faninout) || (NULL == result_segsize)) {
        return (0);
    }

    /* no msg sizes, so no rule */
    if (0 == base_com_rule->n_msg_sizes) {
        return (0);
    }

    /* start from the first entry and advance while entries still fit */
    chosen = base_com_rule->msg_rules;
    for (idx = 0, candidate = chosen; idx < base_com_rule->n_msg_sizes; idx++, candidate++) {
        if (candidate->msg_size > mpi_msgsize) {
            break;
        }
        chosen = candidate;
    }

    OPAL_OUTPUT((ompi_coll_tuned_stream,"Selected the following msg rule id %d\n", chosen->msg_rule_id));
    ompi_coll_tuned_dump_msg_rule (chosen);

    /* hand back the topology fan-in/out */
    *result_topo_faninout = chosen->result_topo_faninout;

    /* hand back the segment size */
    *result_segsize = chosen->result_segsize;

    /* the algorithm/method to use is the return value */
    return (chosen->result_alg);
}
|
||||
|
||||
|
@ -42,24 +42,24 @@
|
||||
/* recheck the setting of forced, called on module create (i.e. for each new comm) */
|
||||
|
||||
int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
||||
coll_tuned_force_algorithm_params_t *forced_values)
|
||||
coll_tuned_force_algorithm_params_t *forced_values)
|
||||
{
|
||||
mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
|
||||
mca_base_param_lookup_int (mca_params.segsize_param_index, &(forced_values->segsize));
|
||||
mca_base_param_lookup_int (mca_params.tree_fanout_param_index, &(forced_values->tree_fanout));
|
||||
mca_base_param_lookup_int (mca_params.chain_fanout_param_index, &(forced_values->chain_fanout));
|
||||
mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
|
||||
mca_base_param_lookup_int (mca_params.segsize_param_index, &(forced_values->segsize));
|
||||
mca_base_param_lookup_int (mca_params.tree_fanout_param_index, &(forced_values->tree_fanout));
|
||||
mca_base_param_lookup_int (mca_params.chain_fanout_param_index, &(forced_values->chain_fanout));
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
/* special version of above just for barrier which only has one option available (at the moment...) */
|
||||
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
||||
coll_tuned_force_algorithm_params_t *forced_values)
|
||||
coll_tuned_force_algorithm_params_t *forced_values)
|
||||
{
|
||||
mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
|
||||
mca_base_param_lookup_int (mca_params.algorithm_param_index, &(forced_values->algorithm));
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
@ -61,8 +61,6 @@ int ompi_coll_tuned_forced_getvalues (coll_tuned_force_algorithm_mca_param_indic
|
||||
int ompi_coll_tuned_forced_getvalues_barrier (coll_tuned_force_algorithm_mca_param_indices_t mca_params,
|
||||
coll_tuned_force_algorithm_params_t *forced_values);
|
||||
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -45,87 +45,87 @@ static const mca_coll_base_module_1_0_0_t *to_use = NULL;
|
||||
*/
|
||||
static const mca_coll_base_module_1_0_0_t intra_fixed = {
|
||||
|
||||
/* Initialization / finalization functions */
|
||||
/* Initialization / finalization functions */
|
||||
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
|
||||
/* Collective function pointers */
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgather_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_fixed, */
|
||||
NULL,
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_alltoall_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_fixed, */
|
||||
ompi_coll_tuned_allreduce_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_alltoall_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_fixed, */
|
||||
NULL,
|
||||
ompi_coll_tuned_barrier_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_bcast_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_fixed, */
|
||||
ompi_coll_tuned_barrier_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_bcast_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_gather_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_fixed, */
|
||||
NULL,
|
||||
ompi_coll_tuned_reduce_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_fixed, */
|
||||
ompi_coll_tuned_reduce_intra_dec_fixed,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_scan_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_intra_dec_fixed, */
|
||||
/* ompi_coll_tuned_scatter_intra_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_fixed */
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_fixed */
|
||||
NULL
|
||||
};
|
||||
|
||||
static const mca_coll_base_module_1_0_0_t intra_dynamic = {
|
||||
|
||||
/* Initialization / finalization functions */
|
||||
/* Initialization / finalization functions */
|
||||
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
|
||||
/* Collective function pointers */
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgather_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgatherv_intra_dec_dynamic, */
|
||||
NULL,
|
||||
ompi_coll_tuned_allreduce_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_alltoall_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_dynamic, */
|
||||
ompi_coll_tuned_allreduce_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_alltoall_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_alltoallv_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoallw_intra_dec_dynamic, */
|
||||
NULL,
|
||||
ompi_coll_tuned_barrier_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_bcast_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_dynamic, */
|
||||
ompi_coll_tuned_barrier_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
ompi_coll_tuned_bcast_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_exscan_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gather_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gatherv_intra_dec_dynamic, */
|
||||
NULL,
|
||||
ompi_coll_tuned_reduce_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_dynamic, */
|
||||
ompi_coll_tuned_reduce_intra_dec_dynamic,
|
||||
/* NULL, */
|
||||
/* ompi_coll_tuned_reduce_scatter_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scan_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_intra_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scatter_intra_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_dynamic */
|
||||
/* ompi_coll_tuned_scatterv_intra_dec_dynamic */
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -137,87 +137,87 @@ static const mca_coll_base_module_1_0_0_t intra_dynamic = {
|
||||
*/
|
||||
static const mca_coll_base_module_1_0_0_t inter_fixed = {
|
||||
|
||||
/* Initialization / finalization functions */
|
||||
/* Initialization / finalization functions */
|
||||
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
|
||||
/* Collective function pointers */
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgather_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_barrier_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_barrier_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_bcast_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_bcast_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_exscan_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_exscan_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_gather_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_reduce_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_scan_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_inter_dec_fixed, */
|
||||
/* ompi_coll_tuned_scatter_inter_dec_fixed, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_fixed */
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_fixed */
|
||||
NULL
|
||||
};
|
||||
|
||||
static const mca_coll_base_module_1_0_0_t inter_dynamic = {
|
||||
|
||||
/* Initialization / finalization functions */
|
||||
/* Initialization / finalization functions */
|
||||
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
ompi_coll_tuned_module_init,
|
||||
ompi_coll_tuned_module_finalize,
|
||||
|
||||
/* Collective function pointers */
|
||||
/* Collective function pointers */
|
||||
|
||||
/* ompi_coll_tuned_allgather_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgather_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allgatherv_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_allreduce_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoall_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoallv_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_alltoallw_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_barrier_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_barrier_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_bcast_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_bcast_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_exscan_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_exscan_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gather_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gather_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_gatherv_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_reduce_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_reduce_scatter_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scan_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scan_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatter_inter_dec_dynamic, */
|
||||
/* ompi_coll_tuned_scatter_inter_dec_dynamic, */
|
||||
NULL,
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_dynamic */
|
||||
/* ompi_coll_tuned_scatterv_inter_dec_dynamic */
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -233,7 +233,7 @@ static const mca_coll_base_module_1_0_0_t inter_dynamic = {
|
||||
* required level of thread support.
|
||||
*/
|
||||
int ompi_coll_tuned_init_query(bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
/* Nothing to do */
|
||||
|
||||
@ -248,38 +248,38 @@ int ompi_coll_tuned_init_query(bool enable_progress_threads,
|
||||
*/
|
||||
const mca_coll_base_module_1_0_0_t *
|
||||
ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority,
|
||||
struct mca_coll_base_comm_t **data)
|
||||
struct mca_coll_base_comm_t **data)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:module_tuned query called"));
|
||||
|
||||
*priority = ompi_coll_tuned_priority;
|
||||
|
||||
/*
|
||||
* Choose whether to use [intra|inter] decision functions
|
||||
* and if using fixed OR dynamic rule sets.
|
||||
* Right now you cannot mix them, maybe later on it can be changed
|
||||
* but this would probably add an extra if and funct call to the path
|
||||
*
|
||||
*/
|
||||
/*
|
||||
* Choose whether to use [intra|inter] decision functions
|
||||
* and if using fixed OR dynamic rule sets.
|
||||
* Right now you cannot mix them, maybe later on it can be changed
|
||||
* but this would probably add an extra if and funct call to the path
|
||||
*
|
||||
*/
|
||||
|
||||
if (OMPI_COMM_IS_INTER(comm)) {
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_dynamic"));
|
||||
to_use = &inter_dynamic;
|
||||
} else {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_fixed"));
|
||||
to_use = &inter_fixed;
|
||||
if (OMPI_COMM_IS_INTER(comm)) {
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_dynamic"));
|
||||
to_use = &inter_dynamic;
|
||||
} else {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using inter_fixed"));
|
||||
to_use = &inter_fixed;
|
||||
}
|
||||
} else { /* is an intra comm */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_dynamic"));
|
||||
to_use = &intra_dynamic;
|
||||
} else {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_fixed"));
|
||||
to_use = &intra_fixed;
|
||||
}
|
||||
}
|
||||
} else { /* is an intra comm */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_dynamic"));
|
||||
to_use = &intra_dynamic;
|
||||
} else {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_query using intra_fixed"));
|
||||
to_use = &intra_fixed;
|
||||
}
|
||||
}
|
||||
return to_use;
|
||||
return to_use;
|
||||
}
|
||||
|
||||
|
||||
@ -289,199 +289,199 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority,
|
||||
const struct mca_coll_base_module_1_0_0_t *
|
||||
ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
|
||||
{
|
||||
int size, rank;
|
||||
struct mca_coll_base_comm_t *data;
|
||||
/* fanout parameters */
|
||||
int rc=0;
|
||||
int i;
|
||||
int size, rank;
|
||||
struct mca_coll_base_comm_t *data;
|
||||
/* fanout parameters */
|
||||
int rc=0;
|
||||
int i;
|
||||
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init called."));
|
||||
|
||||
/* This routine will become more complex and might have to be */
|
||||
/* broken into more sections/function calls */
|
||||
/* This routine will become more complex and might have to be */
|
||||
/* broken into more sections/function calls */
|
||||
|
||||
/* Order of operations:
|
||||
* alloc memory for nb reqs (in case we fall through)
|
||||
* add decision rules if using dynamic rules
|
||||
* compact rules using communicator size info etc
|
||||
* build first guess cached topologies (might depend on the rules from above)
|
||||
*
|
||||
* then attach all to the communicator and return base module funct ptrs
|
||||
*/
|
||||
/* Order of operations:
|
||||
* alloc memory for nb reqs (in case we fall through)
|
||||
* add decision rules if using dynamic rules
|
||||
* compact rules using communicator size info etc
|
||||
* build first guess cached topologies (might depend on the rules from above)
|
||||
*
|
||||
* then attach all to the communicator and return base module funct ptrs
|
||||
*/
|
||||
|
||||
/* Allocate the data that hangs off the communicator */
|
||||
/* Allocate the data that hangs off the communicator */
|
||||
|
||||
if (OMPI_COMM_IS_INTER(comm)) {
|
||||
size = ompi_comm_remote_size(comm);
|
||||
} else {
|
||||
size = ompi_comm_size(comm);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* we still malloc data as it is used by the TUNED modules
|
||||
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
||||
* we place any special info after the default data
|
||||
*
|
||||
* BUT on very large systems we might not be able to allocate all this memory so
|
||||
* we do check a MCA parameter to see if if we should allocate this memory
|
||||
*
|
||||
* The default is set very high
|
||||
*
|
||||
*/
|
||||
|
||||
/* if we within the memory/size limit, allow preallocated data */
|
||||
|
||||
|
||||
if (size<=ompi_coll_tuned_preallocate_memory_comm_size_limit) {
|
||||
data = (mca_coll_base_comm_t*)malloc(sizeof(struct mca_coll_base_comm_t) +
|
||||
(sizeof(ompi_request_t *) * size * 2));
|
||||
|
||||
if (NULL == data) {
|
||||
return NULL;
|
||||
if (OMPI_COMM_IS_INTER(comm)) {
|
||||
size = ompi_comm_remote_size(comm);
|
||||
} else {
|
||||
size = ompi_comm_size(comm);
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) (data + 1);
|
||||
data->mcct_num_reqs = size * 2;
|
||||
}
|
||||
else {
|
||||
data = (mca_coll_base_comm_t*)malloc(sizeof(struct mca_coll_base_comm_t));
|
||||
|
||||
|
||||
/*
|
||||
* we still malloc data as it is used by the TUNED modules
|
||||
* if we don't allocate it and fall back to a BASIC module routine then confuses debuggers
|
||||
* we place any special info after the default data
|
||||
*
|
||||
* BUT on very large systems we might not be able to allocate all this memory so
|
||||
* we do check a MCA parameter to see if if we should allocate this memory
|
||||
*
|
||||
* The default is set very high
|
||||
*
|
||||
*/
|
||||
|
||||
/* if we within the memory/size limit, allow preallocated data */
|
||||
|
||||
|
||||
if (size<=ompi_coll_tuned_preallocate_memory_comm_size_limit) {
|
||||
data = (mca_coll_base_comm_t*)malloc(sizeof(struct mca_coll_base_comm_t) +
|
||||
(sizeof(ompi_request_t *) * size * 2));
|
||||
|
||||
if (NULL == data) {
|
||||
return NULL;
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
}
|
||||
if (NULL == data) {
|
||||
return NULL;
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) (data + 1);
|
||||
data->mcct_num_reqs = size * 2;
|
||||
}
|
||||
else {
|
||||
data = (mca_coll_base_comm_t*)malloc(sizeof(struct mca_coll_base_comm_t));
|
||||
|
||||
if (NULL == data) {
|
||||
return NULL;
|
||||
}
|
||||
data->mcct_reqs = (ompi_request_t **) NULL;
|
||||
data->mcct_num_reqs = 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* If using dynamic and you are MPI_COMM_WORLD and you want to use a parameter file..
|
||||
* then this effects how much storage space you need
|
||||
* (This is a basic version of what will go into V2)
|
||||
*
|
||||
*/
|
||||
/*
|
||||
* If using dynamic and you are MPI_COMM_WORLD and you want to use a parameter file..
|
||||
* then this effects how much storage space you need
|
||||
* (This is a basic version of what will go into V2)
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
size = ompi_comm_size(comm); /* find size so we can (A) decide if to access the file directly */
|
||||
/* (B) so we can get our very own customised ompi_coll_com_rule_t ptr */
|
||||
/* which only has rules in it for our com size */
|
||||
size = ompi_comm_size(comm); /* find size so we can (A) decide if to access the file directly */
|
||||
/* (B) so we can get our very own customised ompi_coll_com_rule_t ptr */
|
||||
/* which only has rules in it for our com size */
|
||||
|
||||
rank = ompi_comm_rank(comm); /* find rank as only MCW:0 opens any tuned conf files */
|
||||
/* actually if they are below a threadhold, they all open it */
|
||||
/* have to build a collective in here.. but just for MCW.. */
|
||||
/* but we have to make sure we have the same rules everywhere :( */
|
||||
rank = ompi_comm_rank(comm); /* find rank as only MCW:0 opens any tuned conf files */
|
||||
/* actually if they are below a threadhold, they all open it */
|
||||
/* have to build a collective in here.. but just for MCW.. */
|
||||
/* but we have to make sure we have the same rules everywhere :( */
|
||||
|
||||
/* if using dynamic rules make sure all overrides are NULL before we start override anything accidently */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
/* if using dynamic rules make sure all overrides are NULL before we start override anything accidently */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
/* base rules */
|
||||
data->all_base_rules = (ompi_coll_alg_rule_t*) NULL;
|
||||
|
||||
/* each collective rule for my com size */
|
||||
for (i=0;i<COLLCOUNT;i++) {
|
||||
data->com_rules[i] = (ompi_coll_com_rule_t*) NULL;
|
||||
data->com_rules[i] = (ompi_coll_com_rule_t*) NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* next dynamic state, recheck all forced rules as well */
|
||||
/* warning, we should check to make sure this is really an INTRA comm here... */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
/* next dynamic state, recheck all forced rules as well */
|
||||
/* warning, we should check to make sure this is really an INTRA comm here... */
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLREDUCE], &(data->user_forced[ALLREDUCE]));
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALL], &(data->user_forced[ALLTOALL]));
|
||||
/* ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV])); */
|
||||
/* ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[ALLTOALLV], &(data->user_forced[ALLTOALLV])); */
|
||||
ompi_coll_tuned_forced_getvalues_barrier (ompi_coll_tuned_forced_params[BARRIER], &(data->user_forced[BARRIER]));
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[BCAST], &(data->user_forced[BCAST]));
|
||||
ompi_coll_tuned_forced_getvalues (ompi_coll_tuned_forced_params[REDUCE], &(data->user_forced[REDUCE]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (&ompi_mpi_comm_world==comm) {
|
||||
if (&ompi_mpi_comm_world==comm) {
|
||||
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
if (ompi_coll_tuned_use_dynamic_rules) {
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init MCW & Dynamic"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init MCW & Dynamic"));
|
||||
|
||||
if (ompi_coll_tuned_dynamic_rules_filename) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Opening [%s]",
|
||||
ompi_coll_tuned_dynamic_rules_filename));
|
||||
rc = ompi_coll_tuned_read_rules_config_file (ompi_coll_tuned_dynamic_rules_filename,
|
||||
&(data->all_base_rules), COLLCOUNT);
|
||||
if (rc>=0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Read %d valid rules\n", rc));
|
||||
/* at this point we all have a base set of rules */
|
||||
/* now we can get our customized communicator sized rule set, for each collective */
|
||||
for (i=0;i<COLLCOUNT;i++) {
|
||||
data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
|
||||
if (ompi_coll_tuned_dynamic_rules_filename) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Opening [%s]",
|
||||
ompi_coll_tuned_dynamic_rules_filename));
|
||||
rc = ompi_coll_tuned_read_rules_config_file (ompi_coll_tuned_dynamic_rules_filename,
|
||||
&(data->all_base_rules), COLLCOUNT);
|
||||
if (rc>=0) {
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Read %d valid rules\n", rc));
|
||||
/* at this point we all have a base set of rules */
|
||||
/* now we can get our customized communicator sized rule set, for each collective */
|
||||
for (i=0;i<COLLCOUNT;i++) {
|
||||
data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
|
||||
}
|
||||
}
|
||||
else { /* failed to read config file, thus make sure its a NULL... */
|
||||
data->all_base_rules = (ompi_coll_alg_rule_t*) NULL;
|
||||
}
|
||||
}
|
||||
else { /* failed to read config file, thus make sure its a NULL... */
|
||||
data->all_base_rules = (ompi_coll_alg_rule_t*) NULL;
|
||||
}
|
||||
|
||||
|
||||
} /* end if a config filename exists */
|
||||
} /* end if a config filename exists */
|
||||
|
||||
} /* end if dynamic_rules */
|
||||
} /* end if dynamic_rules */
|
||||
|
||||
} /* end if MCW */
|
||||
} /* end if MCW */
|
||||
|
||||
/* ok, if using dynamic rules, not MCW and we are just any rank and a base set of rules exist.. ref them */
|
||||
/* order of eval is important here, if we are MCW ompi_mpi_comm_world.c_coll_selected_data is NULL still.. */
|
||||
if ((ompi_coll_tuned_use_dynamic_rules)&&(!(&ompi_mpi_comm_world==comm))&&
|
||||
/* ok, if using dynamic rules, not MCW and we are just any rank and a base set of rules exist.. ref them */
|
||||
/* order of eval is important here, if we are MCW ompi_mpi_comm_world.c_coll_selected_data is NULL still.. */
|
||||
if ((ompi_coll_tuned_use_dynamic_rules)&&(!(&ompi_mpi_comm_world==comm))&&
|
||||
((ompi_mpi_comm_world.c_coll_selected_data)->all_base_rules)) {
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init NOT MCW & Dynamic"));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init NOT MCW & Dynamic"));
|
||||
|
||||
/* this will, erm fail if MCW doesn't exist which it should! */
|
||||
data->all_base_rules = (ompi_mpi_comm_world.c_coll_selected_data)->all_base_rules;
|
||||
/* this will, erm fail if MCW doesn't exist which it should! */
|
||||
data->all_base_rules = (ompi_mpi_comm_world.c_coll_selected_data)->all_base_rules;
|
||||
|
||||
/* at this point we all have a base set of rules if they exist atall */
|
||||
/* now we can get our customized communicator sized rule set, for each collective */
|
||||
for (i=0;i<COLLCOUNT;i++) {
|
||||
data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
|
||||
}
|
||||
}
|
||||
/* at this point we all have a base set of rules if they exist atall */
|
||||
/* now we can get our customized communicator sized rule set, for each collective */
|
||||
for (i=0;i<COLLCOUNT;i++) {
|
||||
data->com_rules[i] = ompi_coll_tuned_get_com_rule_ptr (data->all_base_rules, i, size);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* now for the cached topo functions
|
||||
* guess the initial topologies to use rank 0 as root
|
||||
*/
|
||||
/*
|
||||
* now for the cached topo functions
|
||||
* guess the initial topologies to use rank 0 as root
|
||||
*/
|
||||
|
||||
/* general n fan out tree */
|
||||
data->cached_ntree = ompi_coll_tuned_topo_build_tree (ompi_coll_tuned_init_tree_fanout, comm, 0);
|
||||
data->cached_ntree_root = 0;
|
||||
data->cached_ntree_fanout = ompi_coll_tuned_init_tree_fanout;
|
||||
/* general n fan out tree */
|
||||
data->cached_ntree = ompi_coll_tuned_topo_build_tree (ompi_coll_tuned_init_tree_fanout, comm, 0);
|
||||
data->cached_ntree_root = 0;
|
||||
data->cached_ntree_fanout = ompi_coll_tuned_init_tree_fanout;
|
||||
|
||||
/* binary tree */
|
||||
data->cached_bintree = ompi_coll_tuned_topo_build_tree (2, comm, 0);
|
||||
data->cached_bintree_root = 0;
|
||||
/* binary tree */
|
||||
data->cached_bintree = ompi_coll_tuned_topo_build_tree (2, comm, 0);
|
||||
data->cached_bintree_root = 0;
|
||||
|
||||
/* binomial tree */
|
||||
data->cached_bmtree = ompi_coll_tuned_topo_build_bmtree (comm, 0);
|
||||
data->cached_bmtree_root = 0;
|
||||
/* binomial tree */
|
||||
data->cached_bmtree = ompi_coll_tuned_topo_build_bmtree (comm, 0);
|
||||
data->cached_bmtree_root = 0;
|
||||
|
||||
/*
|
||||
* chains (fanout followed by pipelines)
|
||||
* are more difficuilt as the fan out really really depends on message size [sometimes]..
|
||||
* as size gets larger fan-out gets smaller [usually]
|
||||
*
|
||||
* will probably change how we cache this later, for now a midsize
|
||||
* GEF
|
||||
*/
|
||||
data->cached_chain = ompi_coll_tuned_topo_build_chain (ompi_coll_tuned_init_chain_fanout, comm, 0);
|
||||
data->cached_chain_root = 0;
|
||||
data->cached_chain_fanout = ompi_coll_tuned_init_chain_fanout;
|
||||
/*
|
||||
* chains (fanout followed by pipelines)
|
||||
* are more difficuilt as the fan out really really depends on message size [sometimes]..
|
||||
* as size gets larger fan-out gets smaller [usually]
|
||||
*
|
||||
* will probably change how we cache this later, for now a midsize
|
||||
* GEF
|
||||
*/
|
||||
data->cached_chain = ompi_coll_tuned_topo_build_chain (ompi_coll_tuned_init_chain_fanout, comm, 0);
|
||||
data->cached_chain_root = 0;
|
||||
data->cached_chain_fanout = ompi_coll_tuned_init_chain_fanout;
|
||||
|
||||
/* standard pipeline */
|
||||
data->cached_pipeline = ompi_coll_tuned_topo_build_chain (1, comm, 0);
|
||||
data->cached_pipeline_root = 0;
|
||||
/* standard pipeline */
|
||||
data->cached_pipeline = ompi_coll_tuned_topo_build_chain (1, comm, 0);
|
||||
data->cached_pipeline_root = 0;
|
||||
|
||||
/* All done */
|
||||
/* All done */
|
||||
|
||||
comm->c_coll_selected_data = data;
|
||||
comm->c_coll_selected_data = data;
|
||||
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
|
||||
return to_use;
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:module_init Tuned is in use"));
|
||||
return to_use;
|
||||
}
|
||||
|
||||
|
||||
@ -490,48 +490,48 @@ ompi_coll_tuned_module_init(struct ompi_communicator_t *comm)
|
||||
*/
|
||||
int ompi_coll_tuned_module_finalize(struct ompi_communicator_t *comm)
|
||||
{
|
||||
if (NULL == comm->c_coll_selected_module) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
if (NULL == comm->c_coll_selected_module) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
|
||||
the generel c_coll_selected_data */
|
||||
/* Reset the reqs to NULL/0 -- they'll be freed as part of freeing
|
||||
the generel c_coll_selected_data */
|
||||
|
||||
comm->c_coll_selected_data->mcct_reqs = NULL;
|
||||
comm->c_coll_selected_data->mcct_num_reqs = 0;
|
||||
comm->c_coll_selected_data->mcct_reqs = NULL;
|
||||
comm->c_coll_selected_data->mcct_num_reqs = 0;
|
||||
#endif
|
||||
|
||||
/* free any cached information that has been allocated */
|
||||
if (comm->c_coll_selected_data->cached_ntree) { /* destroy general tree if defined */
|
||||
/* free any cached information that has been allocated */
|
||||
if (comm->c_coll_selected_data->cached_ntree) { /* destroy general tree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_ntree);
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_bintree) { /* destroy bintree if defined */
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_bintree) { /* destroy bintree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_bintree);
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_bmtree) { /* destroy bmtree if defined */
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_bmtree) { /* destroy bmtree if defined */
|
||||
ompi_coll_tuned_topo_destroy_tree (&comm->c_coll_selected_data->cached_bmtree);
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_chain) { /* destroy general chain if defined */
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_chain) { /* destroy general chain if defined */
|
||||
ompi_coll_tuned_topo_destroy_chain (&comm->c_coll_selected_data->cached_chain);
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_pipeline) { /* destroy pipeline if defined */
|
||||
}
|
||||
if (comm->c_coll_selected_data->cached_pipeline) { /* destroy pipeline if defined */
|
||||
ompi_coll_tuned_topo_destroy_chain (&comm->c_coll_selected_data->cached_pipeline);
|
||||
}
|
||||
}
|
||||
|
||||
/* if any algorithm rules are cached on the communicator, only free them if its MCW */
|
||||
/* as this is the only place they are allocated by reading the decision configure file */
|
||||
if ((ompi_coll_tuned_use_dynamic_rules)&&(&ompi_mpi_comm_world==comm)) {
|
||||
if (comm->c_coll_selected_data->all_base_rules) {
|
||||
ompi_coll_tuned_free_all_rules (comm->c_coll_selected_data->all_base_rules, COLLCOUNT);
|
||||
}
|
||||
}
|
||||
/* if any algorithm rules are cached on the communicator, only free them if its MCW */
|
||||
/* as this is the only place they are allocated by reading the decision configure file */
|
||||
if ((ompi_coll_tuned_use_dynamic_rules)&&(&ompi_mpi_comm_world==comm)) {
|
||||
if (comm->c_coll_selected_data->all_base_rules) {
|
||||
ompi_coll_tuned_free_all_rules (comm->c_coll_selected_data->all_base_rules, COLLCOUNT);
|
||||
}
|
||||
}
|
||||
|
||||
/* if allocated memory free it */
|
||||
if (comm->c_coll_selected_data) {
|
||||
free(comm->c_coll_selected_data);
|
||||
comm->c_coll_selected_data = NULL;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
/* if allocated memory free it */
|
||||
if (comm->c_coll_selected_data) {
|
||||
free(comm->c_coll_selected_data);
|
||||
comm->c_coll_selected_data = NULL;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -37,9 +37,9 @@
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
ompi_datatype_t* datatype, ompi_op_t* op,
|
||||
int root, ompi_communicator_t* comm, uint32_t segsize,
|
||||
int fanout)
|
||||
ompi_datatype_t* datatype, ompi_op_t* op,
|
||||
int root, ompi_communicator_t* comm, uint32_t segsize,
|
||||
int fanout)
|
||||
{
|
||||
int ret, line, rank, size, i = 0;
|
||||
int recvcount, sendcount, prevcount, inbi, previnbi;
|
||||
@ -96,10 +96,10 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
}
|
||||
realsegsize = segcount * ext;
|
||||
|
||||
/* printf("rank %d root %d count %d \t\t segsize %d typesize %d typeext %d realsegsize %d segcount %d num_segments %d\n", */
|
||||
/* rank, root, count, segsize, typelng, ext, realsegsize, segcount, num_segments); */
|
||||
/* printf("rank %d root %d count %d \t\t segsize %d typesize %d typeext %d realsegsize %d segcount %d num_segments %d\n", */
|
||||
/* rank, root, count, segsize, typelng, ext, realsegsize, segcount, num_segments); */
|
||||
|
||||
/* ompi_coll_tuned_topo_dump_chain (chain, rank); */
|
||||
/* ompi_coll_tuned_topo_dump_chain (chain, rank); */
|
||||
|
||||
|
||||
if (sendbuf != MPI_IN_PLACE) {
|
||||
@ -111,10 +111,10 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
|
||||
/* handle special case when size == 1 */
|
||||
if (1 == size ) {
|
||||
if (sendbuf != MPI_IN_PLACE) {
|
||||
ompi_ddt_copy_content_same_ddt( datatype, count, (char*)recvbuf, (char*)sendbuf );
|
||||
}
|
||||
return MPI_SUCCESS;
|
||||
if (sendbuf != MPI_IN_PLACE) {
|
||||
ompi_ddt_copy_content_same_ddt( datatype, count, (char*)recvbuf, (char*)sendbuf );
|
||||
}
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* handle non existant recv buffer (i.e. its NULL.. like basic allreduce uses!) */
|
||||
@ -173,26 +173,26 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
/* check for root might not be needed as it should be checked higher up */
|
||||
if ((MPI_IN_PLACE==sendbuf)&&(rank==root)) {
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi],
|
||||
recvcount,datatype,
|
||||
chain->chain_next[i],
|
||||
MCA_COLL_BASE_TAG_REDUCE,
|
||||
comm, &reqs[inbi]));
|
||||
recvcount,datatype,
|
||||
chain->chain_next[i],
|
||||
MCA_COLL_BASE_TAG_REDUCE,
|
||||
comm, &reqs[inbi]));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
} else {
|
||||
ret = MCA_PML_CALL(irecv(accumbuf+segindex*realsegsize,
|
||||
recvcount,datatype,
|
||||
chain->chain_next[i],
|
||||
MCA_COLL_BASE_TAG_REDUCE,
|
||||
comm, &reqs[inbi]));
|
||||
recvcount,datatype,
|
||||
chain->chain_next[i],
|
||||
MCA_COLL_BASE_TAG_REDUCE,
|
||||
comm, &reqs[inbi]));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
} /* if first segment */
|
||||
else { /* perform a irecv into the standard inbuf */
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi],recvcount,datatype,
|
||||
chain->chain_next[i],
|
||||
MCA_COLL_BASE_TAG_REDUCE,
|
||||
comm, &reqs[inbi]));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
ret = MCA_PML_CALL(irecv(inbuf[inbi],recvcount,datatype,
|
||||
chain->chain_next[i],
|
||||
MCA_COLL_BASE_TAG_REDUCE,
|
||||
comm, &reqs[inbi]));
|
||||
if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
|
||||
}
|
||||
|
||||
}
|
||||
@ -255,11 +255,11 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
} /* end of for each segment */
|
||||
|
||||
/* clean up */
|
||||
/* if (inbuf!=NULL) { */
|
||||
if (inbuf[0] != NULL) free(inbuf[0]);
|
||||
if (inbuf[1] != NULL) free(inbuf[1]);
|
||||
if (allocedaccumbuf) free(accumbuf);
|
||||
/* } */
|
||||
/* if (inbuf!=NULL) { */
|
||||
if (inbuf[0] != NULL) free(inbuf[0]);
|
||||
if (inbuf[1] != NULL) free(inbuf[1]);
|
||||
if (allocedaccumbuf) free(accumbuf);
|
||||
/* } */
|
||||
}
|
||||
|
||||
/* leaf nodes */
|
||||
@ -280,19 +280,19 @@ int ompi_coll_tuned_reduce_intra_chain( void *sendbuf, void *recvbuf, int count,
|
||||
/* error handler */
|
||||
error_hndl:
|
||||
OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret ));
|
||||
/* if( inbuf != NULL ) { */
|
||||
if( inbuf[0] != NULL ) free(inbuf[0]);
|
||||
if( inbuf[1] != NULL ) free(inbuf[1]);
|
||||
if (allocedaccumbuf) free(accumbuf);
|
||||
/* } */
|
||||
/* if( inbuf != NULL ) { */
|
||||
if( inbuf[0] != NULL ) free(inbuf[0]);
|
||||
if( inbuf[1] != NULL ) free(inbuf[1]);
|
||||
if (allocedaccumbuf) free(accumbuf);
|
||||
/* } */
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
|
||||
int count, ompi_datatype_t* datatype,
|
||||
ompi_op_t* op, int root,
|
||||
ompi_communicator_t* comm, uint32_t segsize )
|
||||
int count, ompi_datatype_t* datatype,
|
||||
ompi_op_t* op, int root,
|
||||
ompi_communicator_t* comm, uint32_t segsize )
|
||||
{
|
||||
int rank;
|
||||
|
||||
@ -301,8 +301,8 @@ int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_pipeline rank %d ss %5d", rank, segsize));
|
||||
|
||||
return ompi_coll_tuned_reduce_intra_chain( sendbuf,recvbuf, count,
|
||||
datatype, op, root, comm,
|
||||
segsize, 1 );
|
||||
datatype, op, root, comm,
|
||||
segsize, 1 );
|
||||
}
|
||||
|
||||
|
||||
@ -329,9 +329,9 @@ int ompi_coll_tuned_reduce_intra_pipeline( void *sendbuf, void *recvbuf,
|
||||
*/
|
||||
int
|
||||
ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
int root, struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op,
|
||||
int root, struct ompi_communicator_t *comm)
|
||||
{
|
||||
int i, rank, err, size;
|
||||
ptrdiff_t true_lb, true_extent, lb, extent;
|
||||
@ -356,8 +356,8 @@ ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count,
|
||||
return err;
|
||||
}
|
||||
|
||||
/* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extend */
|
||||
/* for reducing buffer allocation lengths.... */
|
||||
/* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extend */
|
||||
/* for reducing buffer allocation lengths.... */
|
||||
|
||||
ompi_ddt_get_extent(dtype, &lb, &extent);
|
||||
ompi_ddt_get_true_extent(dtype, &true_lb, &true_extent);
|
||||
@ -449,88 +449,85 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
|
||||
int rc;
|
||||
int max_alg = 3;
|
||||
|
||||
ompi_coll_tuned_forced_max_algorithms[REDUCE] = max_alg;
|
||||
ompi_coll_tuned_forced_max_algorithms[REDUCE] = max_alg;
|
||||
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_count",
|
||||
"Number of reduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
rc = mca_base_param_reg_int (&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_count",
|
||||
"Number of reduce algorithms available",
|
||||
false, true, max_alg, NULL);
|
||||
|
||||
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm",
|
||||
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->algorithm_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm",
|
||||
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
mca_param_indices->segsize_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_segmentsize",
|
||||
"Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.",
|
||||
false, false, 0, NULL);
|
||||
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
mca_param_indices->tree_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_tree_fanout",
|
||||
"Fanout for n-tree used for reduce algorithms. Only has meaning if algorithm is forced and supports n-tree topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_tree_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
|
||||
return (MPI_SUCCESS);
|
||||
mca_param_indices->chain_fanout_param_index = mca_base_param_reg_int(&mca_coll_tuned_component.super.collm_version,
|
||||
"reduce_algorithm_chain_fanout",
|
||||
"Fanout for chains used for reduce algorithms. Only has meaning if algorithm is forced and supports chain topo based operation.",
|
||||
false, false,
|
||||
ompi_coll_tuned_init_chain_fanout, /* get system wide default */
|
||||
NULL);
|
||||
return (MPI_SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_intra_do_forced(void *sbuf, void* rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op, int root,
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].algorithm));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced selected algorithm %d",
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].algorithm));
|
||||
|
||||
switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
|
||||
switch (comm->c_coll_selected_data->user_forced[REDUCE].algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].segsize,
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].chain_fanout);
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].segsize,
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].chain_fanout);
|
||||
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm,
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].segsize);
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].segsize);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_forced attempt to select algorithm %d when only 0-%d is valid?",
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
comm->c_coll_selected_data->user_forced[REDUCE].algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
||||
int ompi_coll_tuned_reduce_intra_do_this(void *sbuf, void* rbuf, int count,
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op, int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
struct ompi_datatype_t *dtype,
|
||||
struct ompi_op_t *op, int root,
|
||||
struct ompi_communicator_t *comm,
|
||||
int algorithm, int faninout, int segsize)
|
||||
{
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
|
||||
algorithm, faninout, segsize));
|
||||
|
||||
switch (algorithm) {
|
||||
switch (algorithm) {
|
||||
case (0): return ompi_coll_tuned_reduce_intra_dec_fixed (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (1): return ompi_coll_tuned_reduce_intra_basic_linear (sbuf, rbuf, count, dtype, op, root, comm);
|
||||
case (2): return ompi_coll_tuned_reduce_intra_chain (sbuf, rbuf, count, dtype, op, root, comm,
|
||||
segsize, faninout);
|
||||
segsize, faninout);
|
||||
case (3): return ompi_coll_tuned_reduce_intra_pipeline (sbuf, rbuf, count, dtype, op, root, comm,
|
||||
segsize);
|
||||
segsize);
|
||||
default:
|
||||
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));
|
||||
return (MPI_ERR_ARG);
|
||||
} /* switch */
|
||||
|
||||
}
|
||||
|
||||
|
@ -68,8 +68,8 @@ static int calculate_num_nodes_up_to_level( int fanout, int level )
|
||||
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_tree( int fanout,
|
||||
struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
{
|
||||
int rank, size;
|
||||
int schild, sparent;
|
||||
@ -187,7 +187,7 @@ int ompi_coll_tuned_topo_destroy_tree( ompi_coll_tree_t** tree )
|
||||
|
||||
ompi_coll_tree_t*
|
||||
ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
int root )
|
||||
{
|
||||
int childs = 0;
|
||||
int rank;
|
||||
@ -256,8 +256,8 @@ ompi_coll_tuned_topo_build_bmtree( struct ompi_communicator_t* comm,
|
||||
|
||||
ompi_coll_chain_t*
|
||||
ompi_coll_tuned_topo_build_chain( int fanout,
|
||||
struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
struct ompi_communicator_t* comm,
|
||||
int root )
|
||||
{
|
||||
int rank, size;
|
||||
int srank; /* shifted rank */
|
||||
@ -428,23 +428,23 @@ int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain )
|
||||
|
||||
/*
 * Debug helper: write a description of a tree topology to the tuned
 * collective output stream.
 *
 * tree - topology to describe; dereferenced unconditionally
 * rank - rank printed as the first field of the header line
 *
 * Emits one header line followed by one "[index] rank" line per entry of
 * tree->tree_next.  Always returns 0.
 */
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
{
    int i;

    /* Arguments follow the order of the labels in the format string
     * (root, fanout, BM, nextsize, prev); previously tree_bmtree and
     * tree_fanout were swapped, so each was printed under the other's label. */
    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:topo_dump_tree %1d tree root %d fanout %d BM %1d nextsize %d prev %d", rank,
                 tree->tree_root, tree->tree_fanout, tree->tree_bmtree, tree->tree_nextsize, tree->tree_prev));

    /* The loop condition already covers the "no children" case. */
    for (i = 0; i < tree->tree_nextsize; i++) {
        OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d", i, tree->tree_next[i]));
    }

    return (0);
}
|
||||
|
||||
/*
 * Debug helper: write a description of a chain topology to the tuned
 * collective output stream.
 *
 * chain - topology to describe; dereferenced unconditionally
 * rank  - rank printed as the first field of the header line
 *
 * Emits one header line followed by one "[index] rank" entry per element
 * of chain->chain_next.  Always returns 0.
 */
int ompi_coll_tuned_topo_dump_chain (ompi_coll_chain_t* chain, int rank)
{
    int idx = 0;

    OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:topo:topo_dump_chain %1d chain root %d fanout %d nextsize %d prev %d\n", rank,
                 chain->chain_root, chain->chain_numchain, chain->chain_nextsize, chain->chain_prev));

    if (0 != chain->chain_nextsize) {
        while (idx < chain->chain_nextsize) {
            OPAL_OUTPUT((ompi_coll_tuned_stream,"[%1d] %d ", idx, chain->chain_next[idx]));
            idx++;
        }
    }

    return (0);
}
|
||||
|
||||
|
@ -65,8 +65,6 @@ int ompi_coll_tuned_topo_destroy_chain( ompi_coll_chain_t** chain );
|
||||
/* Debug helper: print the fields of a tree topology to the tuned output stream. */
int ompi_coll_tuned_topo_dump_tree (ompi_coll_tree_t* tree, int rank);
|
||||
/* Debug helper: print the fields of a chain topology to the tuned output stream. */
int ompi_coll_tuned_topo_dump_chain (ompi_coll_chain_t* chain, int rank);
|
||||
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -29,16 +29,16 @@
|
||||
#include "coll_tuned_util.h"
|
||||
|
||||
int ompi_coll_tuned_sendrecv_actual( void* sendbuf, int scount, ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status )
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status )
|
||||
|
||||
{ /* post receive first, then send, then waitall... should be fast (I hope) */
|
||||
int err, line = 0;
|
||||
ompi_request_t* reqs[2];
|
||||
ompi_status_public_t statuses[2];
|
||||
int err, line = 0;
|
||||
ompi_request_t* reqs[2];
|
||||
ompi_status_public_t statuses[2];
|
||||
|
||||
/* post new irecv */
|
||||
err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag, comm, &reqs[0]));
|
||||
@ -68,14 +68,14 @@ ompi_status_public_t statuses[2];
|
||||
*/
|
||||
|
||||
int ompi_coll_tuned_sendrecv_actual_localcompleted (
|
||||
void* sendbuf, int scount, ompi_datatype_t* sdatatype, int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype, int source, int rtag,
|
||||
struct ompi_communicator_t* comm, ompi_status_public_t* status )
|
||||
void* sendbuf, int scount, ompi_datatype_t* sdatatype, int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype, int source, int rtag,
|
||||
struct ompi_communicator_t* comm, ompi_status_public_t* status )
|
||||
|
||||
{ /* post receive first, then [local] sync send, then wait... should be fast (I hope) */
|
||||
int err, line = 0;
|
||||
ompi_request_t* req;
|
||||
ompi_status_public_t tmpstatus;
|
||||
int err, line = 0;
|
||||
ompi_request_t* req;
|
||||
ompi_status_public_t tmpstatus;
|
||||
|
||||
/* post new irecv */
|
||||
err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag, comm, &req));
|
||||
@ -98,3 +98,4 @@ ompi_status_public_t tmpstatus;
|
||||
OPAL_OUTPUT ((ompi_coll_tuned_stream, "%s:%d: Error %d occurred\n",__FILE__,line,err));
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
@ -34,55 +34,51 @@ extern "C" {
|
||||
|
||||
/* prototypes */
|
||||
int ompi_coll_tuned_sendrecv_actual( void* sendbuf, int scount, ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status );
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
|
||||
/* inline functions */
|
||||
|
||||
static inline int ompi_coll_tuned_sendrecv( void* sendbuf, int scount, ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status, int myid )
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status, int myid )
|
||||
{
|
||||
if ((dest==myid)&&(source==myid)) {
|
||||
return (int) ompi_ddt_sndrcv(sendbuf, (int32_t) scount, sdatatype, recvbuf, (int32_t) rcount, rdatatype);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_sendrecv_actual (sendbuf, scount, sdatatype, dest, stag, recvbuf, rcount, rdatatype,
|
||||
source, rtag, comm, status);
|
||||
}
|
||||
return ompi_coll_tuned_sendrecv_actual (sendbuf, scount, sdatatype, dest, stag, recvbuf, rcount, rdatatype,
|
||||
source, rtag, comm, status);
|
||||
}
|
||||
|
||||
int ompi_coll_tuned_sendrecv_actual_localcompleted( void* sendbuf, int scount, ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status );
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status );
|
||||
|
||||
|
||||
/* inline functions */
|
||||
|
||||
static inline int ompi_coll_tuned_sendrecv_localcompleted( void* sendbuf, int scount, ompi_datatype_t* sdatatype,
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status, int myid )
|
||||
int dest, int stag,
|
||||
void* recvbuf, int rcount, ompi_datatype_t* rdatatype,
|
||||
int source, int rtag,
|
||||
struct ompi_communicator_t* comm,
|
||||
ompi_status_public_t* status, int myid )
|
||||
{
|
||||
if ((dest==myid)&&(source==myid)) {
|
||||
return (int) ompi_ddt_sndrcv(sendbuf, (int32_t) scount, sdatatype, recvbuf, (int32_t) rcount, rdatatype);
|
||||
}
|
||||
else {
|
||||
return ompi_coll_tuned_sendrecv_actual_localcompleted (sendbuf, scount, sdatatype, dest, stag, recvbuf, rcount, rdatatype,
|
||||
source, rtag, comm, status);
|
||||
}
|
||||
return ompi_coll_tuned_sendrecv_actual_localcompleted (sendbuf, scount, sdatatype, dest, stag, recvbuf, rcount, rdatatype,
|
||||
source, rtag, comm, status);
|
||||
}
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user