coll/libnbc: add recursive doubling algorithm for MPI_Iallreduce
Signed-off-by: Alex Anenkov <anenkov.ru@gmail.com>
Этот коммит содержится в:
родитель
163bbd4f04
Коммит
77d466edf3
@ -60,6 +60,7 @@ static mca_base_var_enum_value_t iallreduce_algorithms[] = {
|
|||||||
{1, "ring"},
|
{1, "ring"},
|
||||||
{2, "binomial"},
|
{2, "binomial"},
|
||||||
{3, "rabenseifner"},
|
{3, "rabenseifner"},
|
||||||
|
{4, "recursive_doubling"},
|
||||||
{0, NULL}
|
{0, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -225,7 +226,7 @@ libnbc_register(void)
|
|||||||
(void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum);
|
(void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum);
|
||||||
mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version,
|
mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version,
|
||||||
"iallreduce_algorithm",
|
"iallreduce_algorithm",
|
||||||
"Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner",
|
"Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling",
|
||||||
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
|
||||||
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL,
|
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL,
|
||||||
&libnbc_iallreduce_algorithm);
|
&libnbc_iallreduce_algorithm);
|
||||||
|
@ -28,6 +28,9 @@
|
|||||||
|
|
||||||
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf,
|
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf,
|
||||||
void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf);
|
void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf);
|
||||||
|
static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf,
|
||||||
|
int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op,
|
||||||
|
char inplace, NBC_Schedule *schedule, void *tmpbuf);
|
||||||
static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf,
|
static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf,
|
||||||
void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule,
|
void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule,
|
||||||
void *tmpbuf);
|
void *tmpbuf);
|
||||||
@ -69,7 +72,7 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
|
|||||||
#ifdef NBC_CACHE_SCHEDULE
|
#ifdef NBC_CACHE_SCHEDULE
|
||||||
NBC_Allreduce_args *args, *found, search;
|
NBC_Allreduce_args *args, *found, search;
|
||||||
#endif
|
#endif
|
||||||
enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER } alg;
|
enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } alg;
|
||||||
char inplace;
|
char inplace;
|
||||||
void *tmpbuf = NULL;
|
void *tmpbuf = NULL;
|
||||||
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
|
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
|
||||||
@ -124,9 +127,11 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
|
|||||||
alg = NBC_ARED_RING;
|
alg = NBC_ARED_RING;
|
||||||
else if (libnbc_iallreduce_algorithm == 2)
|
else if (libnbc_iallreduce_algorithm == 2)
|
||||||
alg = NBC_ARED_BINOMIAL;
|
alg = NBC_ARED_BINOMIAL;
|
||||||
else if (libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op)) {
|
else if (libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op))
|
||||||
alg = NBC_ARED_REDSCAT_ALLGATHER;
|
alg = NBC_ARED_REDSCAT_ALLGATHER;
|
||||||
} else
|
else if (libnbc_iallreduce_algorithm == 4)
|
||||||
|
alg = NBC_ARED_RDBL;
|
||||||
|
else
|
||||||
alg = NBC_ARED_RING;
|
alg = NBC_ARED_RING;
|
||||||
}
|
}
|
||||||
#ifdef NBC_CACHE_SCHEDULE
|
#ifdef NBC_CACHE_SCHEDULE
|
||||||
@ -159,6 +164,9 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
|
|||||||
case NBC_ARED_RING:
|
case NBC_ARED_RING:
|
||||||
res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, tmpbuf);
|
res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, tmpbuf);
|
||||||
break;
|
break;
|
||||||
|
case NBC_ARED_RDBL:
|
||||||
|
res = allred_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count, datatype, gap, op, inplace, schedule, tmpbuf);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -470,6 +478,161 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* allred_sched_recursivedoubling
|
||||||
|
*
|
||||||
|
* Function: Recursive doubling algorithm for iallreduce operation
|
||||||
|
*
|
||||||
|
* Description: Implements recursive doubling algorithm for iallreduce.
|
||||||
|
* The algorithm preserves order of operations so it can
|
||||||
|
* be used both by commutative and non-commutative operations.
|
||||||
|
* Schedule length: O(\log(p))
|
||||||
|
* Memory requirements:
|
||||||
|
* Each process requires a temporary buffer: count * typesize = O(count)
|
||||||
|
*
|
||||||
|
* Example on 7 nodes:
|
||||||
|
* Initial state
|
||||||
|
* # 0 1 2 3 4 5 6
|
||||||
|
* [0] [1] [2] [3] [4] [5] [6]
|
||||||
|
* Initial adjustment step for non-power of two nodes.
|
||||||
|
* old rank 1 3 5 6
|
||||||
|
* new rank 0 1 2 3
|
||||||
|
* [0+1] [2+3] [4+5] [6]
|
||||||
|
* Step 1
|
||||||
|
* old rank 1 3 5 6
|
||||||
|
* new rank 0 1 2 3
|
||||||
|
* [0+1+] [0+1+] [4+5+] [4+5+]
|
||||||
|
* [2+3+] [2+3+] [6 ] [6 ]
|
||||||
|
* Step 2
|
||||||
|
* old rank 1 3 5 6
|
||||||
|
* new rank 0 1 2 3
|
||||||
|
* [0+1+] [0+1+] [0+1+] [0+1+]
|
||||||
|
* [2+3+] [2+3+] [2+3+] [2+3+]
|
||||||
|
* [4+5+] [4+5+] [4+5+] [4+5+]
|
||||||
|
* [6 ] [6 ] [6 ] [6 ]
|
||||||
|
* Final adjustment step for non-power of two nodes
|
||||||
|
* # 0 1 2 3 4 5 6
|
||||||
|
* [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+]
|
||||||
|
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
|
||||||
|
* [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+]
|
||||||
|
* [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ]
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf,
|
||||||
|
int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op,
|
||||||
|
char inplace, NBC_Schedule *schedule, void *tmpbuf)
|
||||||
|
{
|
||||||
|
int res, pof2, nprocs_rem, vrank;
|
||||||
|
char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL;
|
||||||
|
|
||||||
|
tmpsend = (char*) tmpbuf - gap;
|
||||||
|
tmprecv = (char*) recvbuf;
|
||||||
|
|
||||||
|
if (inplace) {
|
||||||
|
res = NBC_Sched_copy(recvbuf, false, count, datatype,
|
||||||
|
tmpsend, false, count, datatype, schedule, true);
|
||||||
|
} else {
|
||||||
|
res = NBC_Sched_copy((void *)sendbuf, false, count, datatype,
|
||||||
|
tmpsend, false, count, datatype, schedule, true);
|
||||||
|
}
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
|
||||||
|
/* Get nearest power of two less than or equal to comm size */
|
||||||
|
pof2 = opal_next_poweroftwo(p) >> 1;
|
||||||
|
|
||||||
|
/* Handle non-power-of-two case:
|
||||||
|
- Even ranks less than 2 * nprocs_rem send their data to (rank + 1), and
|
||||||
|
sets new rank to -1.
|
||||||
|
- Odd ranks less than 2 * nprocs_rem receive data from (rank - 1),
|
||||||
|
apply appropriate operation, and set new rank to rank/2
|
||||||
|
- Everyone else sets rank to rank - nprocs_rem
|
||||||
|
*/
|
||||||
|
nprocs_rem = p - pof2;
|
||||||
|
if (rank < 2 * nprocs_rem) {
|
||||||
|
if (0 == rank % 2) { /* Even */
|
||||||
|
res = NBC_Sched_send(tmpsend, false, count, datatype, rank + 1, schedule, true);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
vrank = -1;
|
||||||
|
} else { /* Odd */
|
||||||
|
res = NBC_Sched_recv(tmprecv, false, count, datatype, rank - 1, schedule, true);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
|
||||||
|
/* tmpsend = tmprecv (op) tmpsend */
|
||||||
|
res = NBC_Sched_op(tmprecv, false, tmpsend, false, count, datatype, op, schedule, true);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
|
||||||
|
vrank = rank >> 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
vrank = rank - nprocs_rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Communication/Computation loop
|
||||||
|
- Exchange message with remote node.
|
||||||
|
- Perform appropriate operation taking in account order of operations:
|
||||||
|
result = value (op) result
|
||||||
|
*/
|
||||||
|
if (0 <= vrank) {
|
||||||
|
for (int distance = 1; distance < pof2; distance <<= 1) {
|
||||||
|
int remote = vrank ^ distance;
|
||||||
|
|
||||||
|
/* Find real rank of remote node */
|
||||||
|
if (remote < nprocs_rem) {
|
||||||
|
remote = remote * 2 + 1;
|
||||||
|
} else {
|
||||||
|
remote += nprocs_rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Exchange the data */
|
||||||
|
res = NBC_Sched_send(tmpsend, false, count, datatype, remote, schedule, false);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
|
||||||
|
res = NBC_Sched_recv(tmprecv, false, count, datatype, remote, schedule, true);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
|
||||||
|
/* Apply operation */
|
||||||
|
if (rank < remote) {
|
||||||
|
/* tmprecv = tmpsend (op) tmprecv */
|
||||||
|
res = NBC_Sched_op(tmpsend, false, tmprecv, false,
|
||||||
|
count, datatype, op, schedule, true);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
|
||||||
|
/* Swap tmpsend and tmprecv buffers */
|
||||||
|
tmpswap = tmprecv; tmprecv = tmpsend; tmpsend = tmpswap;
|
||||||
|
} else {
|
||||||
|
/* tmpsend = tmprecv (op) tmpsend */
|
||||||
|
res = NBC_Sched_op(tmprecv, false, tmpsend, false,
|
||||||
|
count, datatype, op, schedule, true);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Handle non-power-of-two case:
|
||||||
|
- Even ranks less than 2 * nprocs_rem receive result from (rank + 1)
|
||||||
|
- Odd ranks less than 2 * nprocs_rem send result from tmpsend to (rank - 1)
|
||||||
|
*/
|
||||||
|
if (rank < 2 * nprocs_rem) {
|
||||||
|
if (0 == rank % 2) { /* Even */
|
||||||
|
res = NBC_Sched_recv(recvbuf, false, count, datatype, rank + 1, schedule, false);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
tmpsend = (char *)recvbuf;
|
||||||
|
} else { /* Odd */
|
||||||
|
res = NBC_Sched_send(tmpsend, false, count, datatype, rank - 1, schedule, false);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Copy result back into recvbuf */
|
||||||
|
if (tmpsend != recvbuf) {
|
||||||
|
res = NBC_Sched_copy(tmpsend, false, count, datatype,
|
||||||
|
recvbuf, false, count, datatype, schedule, false);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op,
|
static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op,
|
||||||
int size, int ext, NBC_Schedule *schedule, void *tmpbuf) {
|
int size, int ext, NBC_Schedule *schedule, void *tmpbuf) {
|
||||||
int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes */
|
int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes */
|
||||||
@ -1044,4 +1207,3 @@ int ompi_coll_libnbc_allreduce_inter_init(const void* sendbuf, void* recvbuf, in
|
|||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user