1
1

Merge pull request #6686 from alex-anenkov/coll-iallreduce-recursivedoubling

coll/libnbc: add recursive doubling algorithm for MPI_Iallreduce
Этот коммит содержится в:
Jeff Squyres 2019-06-10 10:09:51 -04:00 коммит произвёл GitHub
родитель 80e0ac7379 77d466edf3
Коммит 7c3aeb3061
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
2 изменённых файлов: 168 добавлений и 5 удалений

Просмотреть файл

@ -60,6 +60,7 @@ static mca_base_var_enum_value_t iallreduce_algorithms[] = {
{1, "ring"},
{2, "binomial"},
{3, "rabenseifner"},
{4, "recursive_doubling"},
{0, NULL}
};
@ -225,7 +226,7 @@ libnbc_register(void)
(void) mca_base_var_enum_create("coll_libnbc_iallreduce_algorithms", iallreduce_algorithms, &new_enum);
mca_base_component_var_register(&mca_coll_libnbc_component.super.collm_version,
"iallreduce_algorithm",
"Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner",
"Which iallreduce algorithm is used: 0 ignore, 1 ring, 2 binomial, 3 rabenseifner, 4 recursive_doubling",
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL,
&libnbc_iallreduce_algorithm);

Просмотреть файл

@ -28,6 +28,9 @@
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf,
void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf);
static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf,
int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op,
char inplace, NBC_Schedule *schedule, void *tmpbuf);
static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf,
void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule,
void *tmpbuf);
@ -69,7 +72,7 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
#ifdef NBC_CACHE_SCHEDULE
NBC_Allreduce_args *args, *found, search;
#endif
enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER } alg;
enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } alg;
char inplace;
void *tmpbuf = NULL;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
@ -124,9 +127,11 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
alg = NBC_ARED_RING;
else if (libnbc_iallreduce_algorithm == 2)
alg = NBC_ARED_BINOMIAL;
else if (libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op)) {
else if (libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op))
alg = NBC_ARED_REDSCAT_ALLGATHER;
} else
else if (libnbc_iallreduce_algorithm == 4)
alg = NBC_ARED_RDBL;
else
alg = NBC_ARED_RING;
}
#ifdef NBC_CACHE_SCHEDULE
@ -159,6 +164,9 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
case NBC_ARED_RING:
res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, tmpbuf);
break;
case NBC_ARED_RDBL:
res = allred_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count, datatype, gap, op, inplace, schedule, tmpbuf);
break;
}
}
@ -470,6 +478,161 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
return OMPI_SUCCESS;
}
/*
* allred_sched_recursivedoubling
*
* Function: Recursive doubling algorithm for iallreduce operation
*
* Description: Implements recursive doubling algorithm for iallreduce.
* The algorithm preserves order of operations so it can
* be used both by commutative and non-commutative operations.
* Schedule length: O(\log(p))
* Memory requirements:
* Each process requires a temporary buffer: count * typesize = O(count)
*
* Example on 7 nodes:
* Initial state
* # 0 1 2 3 4 5 6
* [0] [1] [2] [3] [4] [5] [6]
* Initial adjustment step for non-power of two nodes.
* old rank 1 3 5 6
* new rank 0 1 2 3
* [0+1] [2+3] [4+5] [6]
* Step 1
* old rank 1 3 5 6
* new rank 0 1 2 3
* [0+1+] [0+1+] [4+5+] [4+5+]
* [2+3+] [2+3+] [6 ] [6 ]
* Step 2
* old rank 1 3 5 6
* new rank 0 1 2 3
* [0+1+] [0+1+] [0+1+] [0+1+]
* [2+3+] [2+3+] [2+3+] [2+3+]
* [4+5+] [4+5+] [4+5+] [4+5+]
* [6 ] [6 ] [6 ] [6 ]
* Final adjustment step for non-power of two nodes
* # 0 1 2 3 4 5 6
* [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+]
* [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+]
* [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+]
* [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ]
*
*/
static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf,
int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op,
char inplace, NBC_Schedule *schedule, void *tmpbuf)
{
int res, pof2, nprocs_rem, vrank;
char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL;
tmpsend = (char*) tmpbuf - gap;
tmprecv = (char*) recvbuf;
if (inplace) {
res = NBC_Sched_copy(recvbuf, false, count, datatype,
tmpsend, false, count, datatype, schedule, true);
} else {
res = NBC_Sched_copy((void *)sendbuf, false, count, datatype,
tmpsend, false, count, datatype, schedule, true);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
/* Get nearest power of two less than or equal to comm size */
pof2 = opal_next_poweroftwo(p) >> 1;
/* Handle non-power-of-two case:
- Even ranks less than 2 * nprocs_rem send their data to (rank + 1), and
sets new rank to -1.
- Odd ranks less than 2 * nprocs_rem receive data from (rank - 1),
apply appropriate operation, and set new rank to rank/2
- Everyone else sets rank to rank - nprocs_rem
*/
nprocs_rem = p - pof2;
if (rank < 2 * nprocs_rem) {
if (0 == rank % 2) { /* Even */
res = NBC_Sched_send(tmpsend, false, count, datatype, rank + 1, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
vrank = -1;
} else { /* Odd */
res = NBC_Sched_recv(tmprecv, false, count, datatype, rank - 1, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
/* tmpsend = tmprecv (op) tmpsend */
res = NBC_Sched_op(tmprecv, false, tmpsend, false, count, datatype, op, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
vrank = rank >> 1;
}
} else {
vrank = rank - nprocs_rem;
}
/* Communication/Computation loop
- Exchange message with remote node.
- Perform appropriate operation taking in account order of operations:
result = value (op) result
*/
if (0 <= vrank) {
for (int distance = 1; distance < pof2; distance <<= 1) {
int remote = vrank ^ distance;
/* Find real rank of remote node */
if (remote < nprocs_rem) {
remote = remote * 2 + 1;
} else {
remote += nprocs_rem;
}
/* Exchange the data */
res = NBC_Sched_send(tmpsend, false, count, datatype, remote, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
res = NBC_Sched_recv(tmprecv, false, count, datatype, remote, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
/* Apply operation */
if (rank < remote) {
/* tmprecv = tmpsend (op) tmprecv */
res = NBC_Sched_op(tmpsend, false, tmprecv, false,
count, datatype, op, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
/* Swap tmpsend and tmprecv buffers */
tmpswap = tmprecv; tmprecv = tmpsend; tmpsend = tmpswap;
} else {
/* tmpsend = tmprecv (op) tmpsend */
res = NBC_Sched_op(tmprecv, false, tmpsend, false,
count, datatype, op, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
}
}
}
/* Handle non-power-of-two case:
- Even ranks less than 2 * nprocs_rem receive result from (rank + 1)
- Odd ranks less than 2 * nprocs_rem send result from tmpsend to (rank - 1)
*/
if (rank < 2 * nprocs_rem) {
if (0 == rank % 2) { /* Even */
res = NBC_Sched_recv(recvbuf, false, count, datatype, rank + 1, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
tmpsend = (char *)recvbuf;
} else { /* Odd */
res = NBC_Sched_send(tmpsend, false, count, datatype, rank - 1, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
}
}
/* Copy result back into recvbuf */
if (tmpsend != recvbuf) {
res = NBC_Sched_copy(tmpsend, false, count, datatype,
recvbuf, false, count, datatype, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; }
}
return OMPI_SUCCESS;
}
static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op,
int size, int ext, NBC_Schedule *schedule, void *tmpbuf) {
int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes */
@ -1044,4 +1207,3 @@ int ompi_coll_libnbc_allreduce_inter_init(const void* sendbuf, void* recvbuf, in
return OMPI_SUCCESS;
}