1
1
- correctly handle non commutative operators
 - correctly handle non zero lower bound ddt
 - correctly handle ddt with size > extent
 - revamp NBC_Sched_op so it takes two buffers and matches ompi_op_reduce semantic
 - various fix for inter communicators

Thanks Yuki Matsumoto for the report
Этот коммит содержится в:
Gilles Gouaillardet 2016-07-06 08:57:00 +09:00
родитель 3e559a14a9
Коммит 678d08647b
8 изменённых файлов: 367 добавлений и 250 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* rights reserved. * rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* *
* Author(s): Torsten Hoefler <htor@cs.indiana.edu> * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
@ -111,7 +111,7 @@ static int nbc_schedule_round_append (NBC_Schedule *schedule, void *data, int da
} }
/* this function puts a send into the schedule */ /* this function puts a send into the schedule */
int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) { static int NBC_Sched_send_internal (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule, bool barrier) {
NBC_Args_send send_args; NBC_Args_send send_args;
int ret; int ret;
@ -122,6 +122,7 @@ int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype dataty
send_args.count = count; send_args.count = count;
send_args.datatype = datatype; send_args.datatype = datatype;
send_args.dest = dest; send_args.dest = dest;
send_args.local = local;
/* append to the round-schedule */ /* append to the round-schedule */
ret = nbc_schedule_round_append (schedule, &send_args, sizeof (send_args), barrier); ret = nbc_schedule_round_append (schedule, &send_args, sizeof (send_args), barrier);
@ -134,8 +135,16 @@ int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype dataty
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, false, schedule, barrier);
}
int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) {
return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, true, schedule, barrier);
}
/* this function puts a receive into the schedule */ /* this function puts a receive into the schedule */
int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) { static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule, bool barrier) {
NBC_Args_recv recv_args; NBC_Args_recv recv_args;
int ret; int ret;
@ -146,6 +155,7 @@ int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, in
recv_args.count = count; recv_args.count = count;
recv_args.datatype = datatype; recv_args.datatype = datatype;
recv_args.source = source; recv_args.source = source;
recv_args.local = local;
/* append to the round-schedule */ /* append to the round-schedule */
ret = nbc_schedule_round_append (schedule, &recv_args, sizeof (recv_args), barrier); ret = nbc_schedule_round_append (schedule, &recv_args, sizeof (recv_args), barrier);
@ -158,8 +168,16 @@ int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, in
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, false, schedule, barrier);
}
int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) {
return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, true, schedule, barrier);
}
/* this function puts an operation into the schedule */ /* this function puts an operation into the schedule */
int NBC_Sched_op (void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype,
MPI_Op op, NBC_Schedule *schedule, bool barrier) { MPI_Op op, NBC_Schedule *schedule, bool barrier) {
NBC_Args_op op_args; NBC_Args_op op_args;
int ret; int ret;
@ -168,10 +186,8 @@ int NBC_Sched_op (void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void
op_args.type = OP; op_args.type = OP;
op_args.buf1 = buf1; op_args.buf1 = buf1;
op_args.buf2 = buf2; op_args.buf2 = buf2;
op_args.buf3 = buf3;
op_args.tmpbuf1 = tmpbuf1; op_args.tmpbuf1 = tmpbuf1;
op_args.tmpbuf2 = tmpbuf2; op_args.tmpbuf2 = tmpbuf2;
op_args.tmpbuf3 = tmpbuf3;
op_args.count = count; op_args.count = count;
op_args.op = op; op_args.op = op;
op_args.datatype = datatype; op_args.datatype = datatype;
@ -182,7 +198,7 @@ int NBC_Sched_op (void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void
return ret; return ret;
} }
NBC_DEBUG(10, "added op - ends at byte %i\n", nbc_schedule_get_size (schedule)); NBC_DEBUG(10, "added op2 - ends at byte %i\n", nbc_schedule_get_size (schedule));
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -373,7 +389,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
NBC_Args_op opargs; NBC_Args_op opargs;
NBC_Args_copy copyargs; NBC_Args_copy copyargs;
NBC_Args_unpack unpackargs; NBC_Args_unpack unpackargs;
void *buf1, *buf2, *buf3; void *buf1, *buf2;
/* get round-schedule address */ /* get round-schedule address */
ptr = handle->schedule->data + handle->row_offset; ptr = handle->schedule->data + handle->row_offset;
@ -410,7 +426,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
handle->req_array = tmp; handle->req_array = tmp;
res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag,
MCA_PML_BASE_SEND_STANDARD, handle->comm, MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm,
handle->req_array+handle->req_count - 1)); handle->req_array+handle->req_count - 1));
if (OMPI_SUCCESS != res) { if (OMPI_SUCCESS != res) {
NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count, NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count,
@ -444,7 +460,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
handle->req_array = tmp; handle->req_array = tmp;
res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, handle->comm, res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm,
handle->req_array+handle->req_count-1)); handle->req_array+handle->req_count-1));
if (OMPI_SUCCESS != res) { if (OMPI_SUCCESS != res) {
NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count, NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count,
@ -456,10 +472,10 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
#endif #endif
break; break;
case OP: case OP:
NBC_DEBUG(5, " OP (offset %li) ", offset); NBC_DEBUG(5, " OP2 (offset %li) ", offset);
NBC_GET_BYTES(ptr,opargs); NBC_GET_BYTES(ptr,opargs);
NBC_DEBUG(5, "*buf1: %p, buf2: %p, buf3: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2, NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2,
opargs.buf3, opargs.count, opargs.datatype); opargs.count, opargs.datatype);
/* get buffers */ /* get buffers */
if(opargs.tmpbuf1) { if(opargs.tmpbuf1) {
buf1=(char*)handle->tmpbuf+(long)opargs.buf1; buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
@ -471,12 +487,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
} else { } else {
buf2=opargs.buf2; buf2=opargs.buf2;
} }
if(opargs.tmpbuf3) { ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
buf3=(char*)handle->tmpbuf+(long)opargs.buf3;
} else {
buf3=opargs.buf3;
}
ompi_3buff_op_reduce(opargs.op, buf1, buf2, buf3, opargs.count, opargs.datatype);
break; break;
case COPY: case COPY:
NBC_DEBUG(5, " COPY (offset %li) ", offset); NBC_DEBUG(5, " COPY (offset %li) ", offset);

Просмотреть файл

@ -7,7 +7,7 @@
* rights reserved. * rights reserved.
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science * Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* *
* Author(s): Torsten Hoefler <htor@cs.indiana.edu> * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
@ -16,16 +16,17 @@
#include "nbc_internal.h" #include "nbc_internal.h"
#include "ompi/communicator/communicator.h" #include "ompi/communicator/communicator.h"
#include "ompi/datatype/ompi_datatype.h" #include "ompi/datatype/ompi_datatype.h"
#include "ompi/op/op.h"
#include <assert.h> #include <assert.h>
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf,
void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle);
static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf,
void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule,
NBC_Handle *handle); NBC_Handle *handle);
static inline int allred_sched_linear(int rank, int p, const void *sendbuf, void *recvbuf, int count, static inline int allred_sched_linear(int rank, int p, const void *sendbuf, void *recvbuf, int count,
MPI_Datatype datatype, MPI_Op op, int ext, int size, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, int ext, int size,
NBC_Schedule *schedule, NBC_Handle *handle); NBC_Schedule *schedule, NBC_Handle *handle);
#ifdef NBC_CACHE_SCHEDULE #ifdef NBC_CACHE_SCHEDULE
@ -62,6 +63,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
char inplace; char inplace;
NBC_Handle *handle; NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
ptrdiff_t span, gap;
NBC_IN_PLACE(sendbuf, recvbuf, inplace); NBC_IN_PLACE(sendbuf, recvbuf, inplace);
@ -85,7 +87,8 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
return res; return res;
} }
handle->tmpbuf = malloc (ext * count); span = opal_datatype_span(&datatype->super, count, &gap);
handle->tmpbuf = malloc (span);
if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
@ -101,7 +104,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
} }
/* algorithm selection */ /* algorithm selection */
if(p < 4 || size*count < 65536 || inplace) { if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) {
alg = NBC_ARED_BINOMIAL; alg = NBC_ARED_BINOMIAL;
} else { } else {
alg = NBC_ARED_RING; alg = NBC_ARED_RING;
@ -128,7 +131,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
switch(alg) { switch(alg) {
case NBC_ARED_BINOMIAL: case NBC_ARED_BINOMIAL:
res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, schedule, handle); res = allred_sched_diss(rank, p, count, datatype, gap, sendbuf, recvbuf, op, inplace, schedule, handle);
break; break;
case NBC_ARED_RING: case NBC_ARED_RING:
res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle); res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle);
@ -199,6 +202,7 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co
NBC_Schedule *schedule; NBC_Schedule *schedule;
NBC_Handle *handle; NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
ptrdiff_t span, gap;
rank = ompi_comm_rank (comm); rank = ompi_comm_rank (comm);
rsize = ompi_comm_remote_size (comm); rsize = ompi_comm_remote_size (comm);
@ -220,7 +224,8 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co
return res; return res;
} }
handle->tmpbuf = malloc (ext * count); span = opal_datatype_span(&datatype->super, count, &gap);
handle->tmpbuf = malloc (span);
if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
@ -235,7 +240,7 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co
/* ensure the schedule is released with the handle on error */ /* ensure the schedule is released with the handle on error */
handle->schedule = schedule; handle->schedule = schedule;
res = allred_sched_linear (rank, rsize, sendbuf, recvbuf, count, datatype, op, res = allred_sched_linear (rank, rsize, sendbuf, recvbuf, count, datatype, gap, op,
ext, size, schedule, handle); ext, size, schedule, handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
@ -296,13 +301,33 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co
if (vrank == 0) rank = root; \ if (vrank == 0) rank = root; \
if (vrank == root) rank = 0; \ if (vrank == root) rank = 0; \
} }
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf, void *recvbuf,
MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) {
int root, vrank, maxr, vpeer, peer, res; int root, vrank, maxr, vpeer, peer, res;
char *rbuf, *lbuf, *buf;
int tmprbuf, tmplbuf;
root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */
RANK2VRANK(rank, vrank, root); RANK2VRANK(rank, vrank, root);
maxr = (int)ceil((log((double)p)/LOG2)); maxr = (int)ceil((log((double)p)/LOG2));
/* ensure the result ends up in recvbuf on vrank 0 */
if (0 == (maxr%2)) {
rbuf = (void *)(-gap);
tmprbuf = true;
lbuf = recvbuf;
tmplbuf = false;
} else {
lbuf = (void *)(-gap);
tmplbuf = true;
rbuf = recvbuf;
tmprbuf = false;
if (inplace) {
res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf) - gap, count, datatype, MPI_COMM_SELF);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res;
}
}
}
for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { for (int r = 1, firstred = 1 ; r <= maxr ; ++r) {
if ((vrank % (1 << r)) == 0) { if ((vrank % (1 << r)) == 0) {
@ -311,35 +336,37 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
VRANK2RANK(peer, vpeer, root) VRANK2RANK(peer, vpeer, root)
if (peer < p) { if (peer < p) {
/* we have to wait until we have the data */ /* we have to wait until we have the data */
res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
if (firstred && MPI_IN_PLACE != sendbuf) { if (firstred && !inplace) {
/* perform the reduce with the senbuf */ /* perform the reduce with the senbuf */
res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true); res = NBC_Sched_op (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true);
firstred = 0; firstred = 0;
} else { } else {
/* perform the reduce in my local buffer */ /* perform the reduce in my local buffer */
res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true); res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* swap left and right buffers */
buf = rbuf; rbuf = lbuf ; lbuf = buf;
tmprbuf ^= 1; tmplbuf ^= 1;
} }
} else { } else {
/* we have to send this round */ /* we have to send this round */
vpeer = vrank - (1 << (r - 1)); vpeer = vrank - (1 << (r - 1));
VRANK2RANK(peer, vpeer, root) VRANK2RANK(peer, vpeer, root)
if (firstred && MPI_IN_PLACE != sendbuf) { if (firstred && !inplace) {
/* we have to use the sendbuf in the first round .. */ /* we have to use the sendbuf in the first round .. */
res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
} else { } else {
/* and the recvbuf in all remeining rounds */ /* and the recvbuf in all remaining rounds */
res = NBC_Sched_send (recvbuf, false, count, datatype, peer, schedule, false); res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
@ -373,6 +400,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
} }
} }
if (0 == vrank) assert(lbuf == recvbuf);
/* now send to the right hosts */ /* now send to the right hosts */
for (int r = 0; r < maxr; ++r) { for (int r = 0; r < maxr; ++r) {
if (((vrank + (1 << r) < p) && (vrank < (1 << r))) || (vrank == 0)) { if (((vrank + (1 << r) < p) && (vrank < (1 << r))) || (vrank == 0)) {
@ -550,9 +578,8 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat
break; break;
} }
res = NBC_Sched_op ((char *) recvbuf + roffset, false, (char *) sendbuf + roffset, false, res = NBC_Sched_op ((char *) sendbuf + roffset, false, (char *) recvbuf + roffset, false,
(char *) recvbuf + roffset, false, segsizes[relement], datatype, op, schedule, segsizes[relement], datatype, op, schedule, true);
true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
break; break;
} }
@ -590,7 +617,7 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat
} }
static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle) { ptrdiff_t gap, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle) {
int res; int res;
if (0 == count) { if (0 == count) {
@ -603,34 +630,56 @@ static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf,
return res; return res;
} }
res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, false); /* recv my data to the remote root */
if (0 != rank || 1 ==(rsize%2)) {
res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, false);
} else {
res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule, false);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
if (0 == rank) { if (0 == rank) {
/* wait for data from the remote root */ char *rbuf, *lbuf, *buf;
int tmprbuf, tmplbuf;
res = NBC_Sched_barrier (schedule); res = NBC_Sched_barrier (schedule);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* ensure the result ends up in recvbuf */
if (0 == (rsize%2)) {
lbuf = (void *)(-gap);
tmplbuf = true;
rbuf = recvbuf;
tmprbuf = false;
} else {
rbuf = (void *)(-gap);
tmprbuf = true;
lbuf = recvbuf;
tmplbuf = false;
}
/* get data from remote peers and reduce */ /* get data from remote peers and reduce */
for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) {
res = NBC_Sched_recv (0, true, count, datatype, rpeer, schedule, true); res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, rpeer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true);
schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* swap left and right buffers */
buf = rbuf; rbuf = lbuf ; lbuf = buf;
tmprbuf ^= 1; tmplbuf ^= 1;
} }
/* exchange our result with the remote root (each root will broadcast to the other's peers) */ /* exchange our result with the remote root (each root will broadcast to the other's peers) */
res = NBC_Sched_recv (0, true, count, datatype, 0, schedule, false); res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
@ -643,7 +692,7 @@ static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf,
/* broadcast the result to all remote peers */ /* broadcast the result to all remote peers */
for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) {
res = NBC_Sched_send (0, true, count, datatype, rpeer, schedule, false); res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rpeer, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }

Просмотреть файл

@ -45,7 +45,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_
struct ompi_communicator_t *comm, ompi_request_t ** request, struct ompi_communicator_t *comm, ompi_request_t ** request,
struct mca_coll_base_module_2_1_0_t *module) { struct mca_coll_base_module_2_1_0_t *module) {
int rank, p, res; int rank, p, res;
MPI_Aint ext; ptrdiff_t gap, span;
NBC_Schedule *schedule; NBC_Schedule *schedule;
#ifdef NBC_CACHE_SCHEDULE #ifdef NBC_CACHE_SCHEDULE
NBC_Scan_args *args, *found, search; NBC_Scan_args *args, *found, search;
@ -59,27 +59,27 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_
rank = ompi_comm_rank (comm); rank = ompi_comm_rank (comm);
p = ompi_comm_size (comm); p = ompi_comm_size (comm);
res = ompi_datatype_type_extent(datatype, &ext);
if (MPI_SUCCESS != res) {
NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
return res;
}
res = NBC_Init_handle(comm, &handle, libnbc_module); res = NBC_Init_handle(comm, &handle, libnbc_module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
if (inplace && rank < p - 1) { span = opal_datatype_span(&datatype->super, count, &gap);
/* need more buffer space for the inplace case */ if (0 < rank) {
handle->tmpbuf = malloc(ext * count * 2); handle->tmpbuf = malloc(span);
} else { if (handle->tmpbuf == NULL) {
handle->tmpbuf = malloc(ext * count); NBC_Return_handle (handle);
} return OMPI_ERR_OUT_OF_RESOURCE;
}
if (handle->tmpbuf == NULL) { if (inplace) {
NBC_Return_handle (handle); NBC_Copy(recvbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm);
return OMPI_ERR_OUT_OF_RESOURCE; } else {
NBC_Copy(sendbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle);
return res;
}
} }
#ifdef NBC_CACHE_SCHEDULE #ifdef NBC_CACHE_SCHEDULE
@ -102,13 +102,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_
handle->schedule = schedule; handle->schedule = schedule;
if (rank != 0) { if (rank != 0) {
if (inplace && rank < p - 1) { res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false);
/* if sendbuf == recvbuf do not clobber the send buffer until it has been combined
* with the incoming data. */
res = NBC_Sched_recv ((void *) (ext * count), true, count, datatype, rank-1, schedule, false);
} else {
res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
@ -123,15 +117,8 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_
return res; return res;
} }
/* perform the reduce in my temporary buffer */ res = NBC_Sched_op (recvbuf, false, (void *)(-gap), true, count,
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after */ datatype, op, schedule, true);
if (inplace) {
res = NBC_Sched_op (0, true, sendbuf, false, (void *)(ext * count), true, count,
datatype, op, schedule, true);
} else {
res = NBC_Sched_op (0, true, sendbuf, false, recvbuf, false, count, datatype, op,
schedule, true);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
@ -139,24 +126,18 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_
} }
/* send reduced data onward */ /* send reduced data onward */
res = NBC_Sched_send (0, true, count, datatype, rank + 1, schedule, false); res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
if (inplace) {
/* copy the received data into the receive buffer */
res = NBC_Sched_copy ((void *)(ext * count), true, count, datatype, recvbuf,
false, count, datatype, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle);
return res;
}
}
} }
} else if (p > 1) { } else if (p > 1) {
res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule, false); if (inplace) {
res = NBC_Sched_send (recvbuf, false, count, datatype, 1, schedule, false);
} else {
res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule, false);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;

Просмотреть файл

@ -10,7 +10,7 @@
* *
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
@ -92,6 +92,7 @@ typedef struct {
MPI_Datatype datatype; MPI_Datatype datatype;
int dest; int dest;
char tmpbuf; char tmpbuf;
bool local;
} NBC_Args_send; } NBC_Args_send;
/* the receive argument struct */ /* the receive argument struct */
@ -102,6 +103,7 @@ typedef struct {
MPI_Datatype datatype; MPI_Datatype datatype;
char tmpbuf; char tmpbuf;
int source; int source;
bool local;
} NBC_Args_recv; } NBC_Args_recv;
/* the operation argument struct */ /* the operation argument struct */
@ -109,10 +111,8 @@ typedef struct {
NBC_Fn_type type; NBC_Fn_type type;
char tmpbuf1; char tmpbuf1;
char tmpbuf2; char tmpbuf2;
char tmpbuf3;
const void *buf1; const void *buf1;
void *buf2; void *buf2;
void *buf3;
MPI_Op op; MPI_Op op;
MPI_Datatype datatype; MPI_Datatype datatype;
int count; int count;
@ -144,8 +144,10 @@ typedef struct {
/* internal function prototypes */ /* internal function prototypes */
int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier); int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest,NBC_Schedule *schedule, bool barrier);
int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier); int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_op (void* buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype,
MPI_Op op, NBC_Schedule *schedule, bool barrier); MPI_Op op, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount,
MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier); MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier);

Просмотреть файл

@ -7,20 +7,23 @@
* rights reserved. * rights reserved.
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science * Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* *
* Author(s): Torsten Hoefler <htor@cs.indiana.edu> * Author(s): Torsten Hoefler <htor@cs.indiana.edu>
* *
*/ */
#include "ompi/op/op.h"
#include "nbc_internal.h" #include "nbc_internal.h"
static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype,
MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle); MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle);
static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize);
static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype,
MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle);
#ifdef NBC_CACHE_SCHEDULE #ifdef NBC_CACHE_SCHEDULE
@ -55,6 +58,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_
enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN } alg; enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN } alg;
NBC_Handle *handle; NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
ptrdiff_t span, gap;
NBC_IN_PLACE(sendbuf, recvbuf, inplace); NBC_IN_PLACE(sendbuf, recvbuf, inplace);
@ -89,19 +93,22 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_
return res; return res;
} }
span = opal_datatype_span(&datatype->super, count, &gap);
/* algorithm selection */ /* algorithm selection */
if (p > 4 || size * count < 65536) { if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) {
alg = NBC_RED_BINOMIAL; alg = NBC_RED_BINOMIAL;
if(rank == root) { if(rank == root) {
/* root reduces in receivebuffer */ /* root reduces in receivebuffer */
handle->tmpbuf = malloc (ext * count); handle->tmpbuf = malloc (span);
redbuf = recvbuf;
} else { } else {
/* recvbuf may not be valid on non-root nodes */ /* recvbuf may not be valid on non-root nodes */
handle->tmpbuf = malloc (ext * count * 2); handle->tmpbuf = malloc (2*span);
redbuf = (char*) handle->tmpbuf + ext * count; redbuf = (char*) handle->tmpbuf + span - gap;
} }
} else { } else {
handle->tmpbuf = malloc (ext * count); handle->tmpbuf = malloc (span);
alg = NBC_RED_CHAIN; alg = NBC_RED_CHAIN;
segsize = 16384/2; segsize = 16384/2;
} }
@ -135,7 +142,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_
switch(alg) { switch(alg) {
case NBC_RED_BINOMIAL: case NBC_RED_BINOMIAL:
res = red_sched_binomial(rank, p, root, sendbuf, recvbuf, count, datatype, op, redbuf, schedule, handle); res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, inplace, schedule, handle);
break; break;
case NBC_RED_CHAIN: case NBC_RED_CHAIN:
res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize); res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize);
@ -201,25 +208,20 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count
struct mca_coll_base_module_2_1_0_t *module) { struct mca_coll_base_module_2_1_0_t *module) {
int rank, res, rsize; int rank, res, rsize;
NBC_Schedule *schedule; NBC_Schedule *schedule;
MPI_Aint ext;
NBC_Handle *handle; NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
ptrdiff_t span, gap;
rank = ompi_comm_rank (comm); rank = ompi_comm_rank (comm);
rsize = ompi_comm_remote_size (comm); rsize = ompi_comm_remote_size (comm);
res = ompi_datatype_type_extent (datatype, &ext);
if (MPI_SUCCESS != res) {
NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
return res;
}
res = NBC_Init_handle(comm, &handle, libnbc_module); res = NBC_Init_handle(comm, &handle, libnbc_module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
handle->tmpbuf = malloc (ext * count); span = opal_datatype_span(&datatype->super, count, &gap);
handle->tmpbuf = malloc (span);
if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
@ -231,7 +233,7 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, count, datatype, op, schedule, handle); res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, (void *)(-gap), count, datatype, op, schedule, handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
@ -257,6 +259,8 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count
/* binomial reduce /* binomial reduce
* if op is not commutative, reduce on rank 0, and then send the result to root rank
*
* working principle: * working principle:
* - each node gets a virtual rank vrank * - each node gets a virtual rank vrank
* - the 'root' node get vrank 0 * - the 'root' node get vrank 0
@ -285,65 +289,81 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count
if (vrank == 0) rank = root; \ if (vrank == 0) rank = root; \
if (vrank == root) rank = 0; \ if (vrank == root) rank = 0; \
} }
static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype,
MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle) { MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) {
int vrank, vpeer, peer, res, maxr; int vroot, vrank, vpeer, peer, res, maxr;
char *rbuf, *lbuf, *buf;
int tmprbuf, tmplbuf;
ptrdiff_t gap;
(void)opal_datatype_span(&datatype->super, count, &gap);
RANK2VRANK(rank, vrank, root); if (ompi_op_is_commute(op)) {
vroot = root;
} else {
vroot = 0;
}
RANK2VRANK(rank, vrank, vroot);
maxr = (int)ceil((log((double)p)/LOG2)); maxr = (int)ceil((log((double)p)/LOG2));
/* ensure the result ends up in redbuf on vrank 0 */
if (0 == (maxr%2)) {
rbuf = (void *)(-gap);
tmprbuf = true;
lbuf = redbuf;
tmplbuf = false;
} else {
lbuf = (void *)(-gap);
tmplbuf = true;
rbuf = redbuf;
tmprbuf = false;
if (inplace) {
res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf)-gap, count, datatype, MPI_COMM_SELF);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res;
}
}
}
for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { for (int r = 1, firstred = 1 ; r <= maxr ; ++r) {
if ((vrank % (1 << r)) == 0) { if ((vrank % (1 << r)) == 0) {
/* we have to receive this round */ /* we have to receive this round */
vpeer = vrank + (1 << (r - 1)); vpeer = vrank + (1 << (r - 1));
VRANK2RANK(peer, vpeer, root) VRANK2RANK(peer, vpeer, vroot)
if (peer < p) { if (peer < p) {
/* we have to wait until we have the data */ /* we have to wait until we have the data */
res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* perform the reduce in my local buffer */ /* perform the reduce in my local buffer */
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
if (firstred) { if (firstred && !inplace) {
if (rank == root) { /* perform the reduce with the senbuf */
/* root is the only one who reduces in the receivebuffer res = NBC_Sched_op (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true);
* take data from sendbuf in first round - save copy */
res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true);
} else {
/* all others may not have a receive buffer
* take data from sendbuf in first round - save copy */
res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, sendbuf, false, 0, true, count,
datatype, op, schedule, true);
}
firstred = 0; firstred = 0;
} else { } else {
if(rank == root) { /* perform the reduce in my local buffer */
/* root is the only one who reduces in the receivebuffer */ res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true);
res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true);
} else {
/* all others may not have a receive buffer */
res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, (char *) redbuf - (intptr_t) handle->tmpbuf,
true, 0, true, count, datatype, op, schedule, true);
}
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* swap left and right buffers */
buf = rbuf; rbuf = lbuf ; lbuf = buf;
tmprbuf ^= 1; tmplbuf ^= 1;
} }
} else { } else {
/* we have to send this round */ /* we have to send this round */
vpeer = vrank - (1 << (r - 1)); vpeer = vrank - (1 << (r - 1));
VRANK2RANK(peer, vpeer, root) VRANK2RANK(peer, vpeer, vroot)
if (firstred) { if (firstred && !inplace) {
/* we did not reduce anything */ /* we have to use the sendbuf in the first round .. */
res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
} else { } else {
/* we have to use the redbuf the root (which works in receivebuf) is never sending .. */ /* and the redbuf in all remaining rounds */
res = NBC_Sched_send ((char *) redbuf - (intptr_t) handle->tmpbuf, true, count, datatype, peer, schedule, res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false);
false);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
@ -354,6 +374,14 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen
break; break;
} }
} }
/* send to root if vroot ! root */
if (vroot != root) {
if (0 == rank) {
res = NBC_Sched_send (redbuf, false, count, datatype, root, schedule, false);
} else if (root == rank) {
res = NBC_Sched_recv (redbuf, false, count, datatype, vroot, schedule, false);
}
}
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -389,18 +417,22 @@ static inline int red_sched_chain (int rank, int p, int root, const void *sendbu
/* last node does not recv */ /* last node does not recv */
if (vrank != p-1) { if (vrank != p-1) {
res = NBC_Sched_recv ((char *) offset, true, thiscount, datatype, rpeer, schedule, true); if (vrank == 0) {
res = NBC_Sched_recv ((char *)recvbuf+offset, false, thiscount, datatype, rpeer, schedule, true);
} else {
res = NBC_Sched_recv ((char *) offset, true, thiscount, datatype, rpeer, schedule, true);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* root reduces into receivebuf */ /* root reduces into receivebuf */
if(vrank == 0) { if(vrank == 0) {
res = NBC_Sched_op ((char *) recvbuf + offset, false, (char *) sendbuf + offset, false, (char *) offset, true, res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) recvbuf + offset, false,
thiscount, datatype, op, schedule, true); thiscount, datatype, op, schedule, true);
} else { } else {
res = NBC_Sched_op ((char *) offset, true, (char *) sendbuf + offset, false, (char *) offset, true, thiscount, res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) offset, true, thiscount,
datatype, op, schedule, true); datatype, op, schedule, true);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
@ -428,33 +460,51 @@ static inline int red_sched_chain (int rank, int p, int root, const void *sendbu
} }
/* simple linear algorithm for intercommunicators */ /* simple linear algorithm for intercommunicators */
static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype,
MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) {
int res; int res;
char *rbuf, *lbuf, *buf;
int tmprbuf, tmplbuf;
if (0 == count) { if (0 == count) {
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
if (MPI_ROOT == root) { if (MPI_ROOT == root) {
res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, true); /* ensure the result ends up in recvbuf */
if (0 == (rsize%2)) {
lbuf = tmpbuf;
tmplbuf = true;
rbuf = recvbuf;
tmprbuf = false;
} else {
rbuf = tmpbuf;
tmprbuf = true;
lbuf = recvbuf;
tmplbuf = false;
}
res = NBC_Sched_recv (lbuf, tmplbuf, count, datatype, 0, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
for (int peer = 1 ; peer < rsize ; ++peer) { for (int peer = 1 ; peer < rsize ; ++peer) {
res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, schedule, true); res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
/* swap left and right buffers */
buf = rbuf; rbuf = lbuf ; lbuf = buf;
tmprbuf ^= 1; tmplbuf ^= 1;
} }
} else if (MPI_PROC_NULL != root) { } else if (MPI_PROC_NULL != root) {
res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule, false); res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }

Просмотреть файл

@ -7,7 +7,7 @@
* rights reserved. * rights reserved.
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science * Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2015 The University of Tennessee and The University * Copyright (c) 2015 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
@ -40,10 +40,12 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
struct mca_coll_base_module_2_1_0_t *module) { struct mca_coll_base_module_2_1_0_t *module) {
int peer, rank, maxr, p, res, count; int peer, rank, maxr, p, res, count;
MPI_Aint ext; MPI_Aint ext;
char *redbuf, *sbuf, inplace; ptrdiff_t gap, span;
char *sbuf, inplace;
NBC_Schedule *schedule; NBC_Schedule *schedule;
NBC_Handle *handle; NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
char *rbuf, *lbuf, *buf;
NBC_IN_PLACE(sendbuf, recvbuf, inplace); NBC_IN_PLACE(sendbuf, recvbuf, inplace);
@ -81,13 +83,15 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
maxr = (int) ceil ((log((double) p) / LOG2)); maxr = (int) ceil ((log((double) p) / LOG2));
handle->tmpbuf = malloc (ext * count * 2); span = opal_datatype_span(&datatype->super, count, &gap);
handle->tmpbuf = malloc (span * 2);
if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
redbuf = (char *) handle->tmpbuf + ext * count; rbuf = (char *)(-gap);
lbuf = (char *)(span - gap);
schedule = OBJ_NEW(NBC_Schedule); schedule = OBJ_NEW(NBC_Schedule);
if (OPAL_UNLIKELY(NULL == schedule)) { if (OPAL_UNLIKELY(NULL == schedule)) {
@ -104,7 +108,7 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
peer = rank + (1 << (r - 1)); peer = rank + (1 << (r - 1));
if (peer < p) { if (peer < p) {
/* we have to wait until we have the data */ /* we have to wait until we have the data */
res = NBC_Sched_recv(0, true, count, datatype, peer, schedule, true); res = NBC_Sched_recv(rbuf, true, count, datatype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
@ -113,19 +117,19 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
if (firstred) { if (firstred) {
/* take reduce data from the sendbuf in the first round -> save copy */ /* take reduce data from the sendbuf in the first round -> save copy */
res = NBC_Sched_op (redbuf - (intptr_t) handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule, true);
op, schedule, true);
firstred = 0; firstred = 0;
} else { } else {
/* perform the reduce in my local buffer */ /* perform the reduce in my local buffer */
res = NBC_Sched_op (redbuf - (intptr_t) handle->tmpbuf, true, redbuf - (intptr_t) handle->tmpbuf, true, res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule, true);
0, true, count, datatype, op, schedule, true);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
/* swap left and right buffers */
buf = rbuf; rbuf = lbuf ; lbuf = buf;
} }
} else { } else {
/* we have to send this round */ /* we have to send this round */
@ -134,8 +138,8 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
/* we have to send the senbuf */ /* we have to send the senbuf */
res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
} else { } else {
/* we send an already reduced value from redbuf */ /* we send an already reduced value from lbuf */
res = NBC_Sched_send (redbuf - (intptr_t) handle->tmpbuf, true, count, datatype, peer, schedule, false); res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
@ -157,9 +161,9 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
if (rank == 0) { if (rank == 0) {
for (long int r = 1, offset = 0 ; r < p ; ++r) { for (long int r = 1, offset = 0 ; r < p ; ++r) {
offset += recvcounts[r-1]; offset += recvcounts[r-1];
sbuf = redbuf + offset * ext; sbuf = lbuf + (offset*ext);
/* root sends the right buffer to the right receiver */ /* root sends the right buffer to the right receiver */
res = NBC_Sched_send (sbuf - (intptr_t) handle->tmpbuf, true, recvcounts[r], datatype, r, schedule, res = NBC_Sched_send (sbuf, true, recvcounts[r], datatype, r, schedule,
false); false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
@ -167,7 +171,7 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
} }
} }
res = NBC_Sched_copy (redbuf - (intptr_t) handle->tmpbuf, true, recvcounts[0], datatype, recvbuf, false, res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false,
recvcounts[0], datatype, schedule, false); recvcounts[0], datatype, schedule, false);
} else { } else {
res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false); res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false);
@ -199,13 +203,15 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i
int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype, int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype,
MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request,
struct mca_coll_base_module_2_1_0_t *module) { struct mca_coll_base_module_2_1_0_t *module) {
int rank, res, count, rsize; int rank, res, count, lsize, rsize;
MPI_Aint ext; MPI_Aint ext;
ptrdiff_t gap, span;
NBC_Schedule *schedule; NBC_Schedule *schedule;
NBC_Handle *handle; NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
rank = ompi_comm_rank (comm); rank = ompi_comm_rank (comm);
lsize = ompi_comm_size(comm);
rsize = ompi_comm_remote_size (comm); rsize = ompi_comm_remote_size (comm);
res = ompi_datatype_type_extent (datatype, &ext); res = ompi_datatype_type_extent (datatype, &ext);
@ -215,17 +221,19 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf,
} }
count = 0; count = 0;
for (int r = 0 ; r < rsize ; ++r) { for (int r = 0 ; r < lsize ; ++r) {
count += recvcounts[r]; count += recvcounts[r];
} }
span = opal_datatype_span(&datatype->super, count, &gap);
res = NBC_Init_handle(comm, &handle, libnbc_module); res = NBC_Init_handle(comm, &handle, libnbc_module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
if (count > 0) { if (count > 0) {
handle->tmpbuf = malloc (2 * ext * count); handle->tmpbuf = malloc (2 * span);
if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
@ -249,44 +257,55 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf,
} }
if (0 == rank) { if (0 == rank) {
res = NBC_Sched_recv ((void *) 0, true, count, datatype, 0, schedule, true); char *lbuf, *rbuf;
lbuf = (char *)(-gap);
rbuf = (char *)(span-gap);
res = NBC_Sched_recv (lbuf, true, count, datatype, 0, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
for (int peer = 1 ; peer < rsize ; ++peer) { for (int peer = 1 ; peer < rsize ; ++peer) {
res = NBC_Sched_recv ((void *)(ext * count), true, count, datatype, peer, schedule, true); char *tbuf;
res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
res = NBC_Sched_op ((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, datatype, res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype,
op, schedule, true); op, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
tbuf = lbuf; lbuf = rbuf; rbuf = tbuf;
} }
/* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */
res = NBC_Sched_recv ((void *)(ext * count), true, count, datatype, 0, schedule, false); res = NBC_Sched_recv (rbuf, true, count, datatype, 0, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
res = NBC_Sched_send ((void *) 0, true, count, datatype, 0, schedule, true); res = NBC_Sched_send (lbuf, true, count, datatype, 0, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
/* scatter */ /* do the local scatterv with the local communicator */
for (int peer = 0, offset = ext * count ; peer < rsize ; ++peer) { res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false,
res = NBC_Sched_send ((void *)(uintptr_t) offset, true, recvcounts[peer], datatype, peer, schedule, recvcounts[0], datatype, schedule, false);
false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle);
return res;
}
for (int peer = 1, offset = recvcounts[0] * ext; peer < lsize ; ++peer) {
res = NBC_Sched_local_send (lbuf + offset, true, recvcounts[peer], datatype, peer, schedule,
false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
@ -294,13 +313,13 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf,
offset += recvcounts[peer] * ext; offset += recvcounts[peer] * ext;
} }
} } else {
/* receive my block */
/* receive my block */ res = NBC_Sched_local_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false);
res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle);
NBC_Return_handle (handle); return res;
return res; }
} }
res = NBC_Sched_commit (schedule); res = NBC_Sched_commit (schedule);

Просмотреть файл

@ -38,6 +38,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
struct mca_coll_base_module_2_1_0_t *module) { struct mca_coll_base_module_2_1_0_t *module) {
int peer, rank, maxr, p, res, count; int peer, rank, maxr, p, res, count;
MPI_Aint ext; MPI_Aint ext;
ptrdiff_t gap, span;
char *redbuf, *sbuf, inplace; char *redbuf, *sbuf, inplace;
NBC_Schedule *schedule; NBC_Schedule *schedule;
NBC_Handle *handle; NBC_Handle *handle;
@ -73,14 +74,19 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
count = p * recvcount; count = p * recvcount;
if (0 < count) { if (0 < count) {
handle->tmpbuf = malloc (ext*count*2); char *rbuf, *lbuf, *buf;
span = opal_datatype_span(&datatype->super, count, &gap);
handle->tmpbuf = malloc (2*span);
if (NULL == handle->tmpbuf) { if (NULL == handle->tmpbuf) {
OMPI_COLL_LIBNBC_REQUEST_RETURN(handle); OMPI_COLL_LIBNBC_REQUEST_RETURN(handle);
OBJ_RELEASE(schedule); OBJ_RELEASE(schedule);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
redbuf = (char *) handle->tmpbuf + ext * count; rbuf = (void *)(-gap);
lbuf = (char *)(span - gap);
redbuf = (char *) handle->tmpbuf + span - gap;
/* copy data to redbuf if we only have a single node */ /* copy data to redbuf if we only have a single node */
if ((p == 1) && !inplace) { if ((p == 1) && !inplace) {
@ -98,7 +104,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
peer = rank + (1 << (r - 1)); peer = rank + (1 << (r - 1));
if (peer < p) { if (peer < p) {
/* we have to wait until we have the data */ /* we have to wait until we have the data */
res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
@ -106,29 +112,29 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
if (firstred) { if (firstred) {
/* take reduce data from the sendbuf in the first round -> save copy */ /* take reduce data from the sendbuf in the first round -> save copy */
res = NBC_Sched_op (redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule, true);
datatype, op, schedule, true);
firstred = 0; firstred = 0;
} else { } else {
/* perform the reduce in my local buffer */ /* perform the reduce in my local buffer */
res = NBC_Sched_op (redbuf-(unsigned long)handle->tmpbuf, true, redbuf-(unsigned long)handle->tmpbuf, res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule, true);
true, 0, true, count, datatype, op, schedule, true);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
/* swap left and right buffers */
buf = rbuf; rbuf = lbuf ; lbuf = buf;
} }
} else { } else {
/* we have to send this round */ /* we have to send this round */
peer = rank - (1 << (r - 1)); peer = rank - (1 << (r - 1));
if(firstred) { if(firstred) {
/* we have to send the senbuf */ /* we have to send the senbuf */
res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, true); res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
} else { } else {
/* we send an already reduced value from redbuf */ /* we send an already reduced value from redbuf */
res = NBC_Sched_send (redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule, true); res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false);
} }
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
@ -157,17 +163,19 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
} else { } else {
for (int r = 1, offset = 0 ; r < p ; ++r) { for (int r = 1, offset = 0 ; r < p ; ++r) {
offset += recvcount; offset += recvcount;
sbuf = ((char *)redbuf) + (offset*ext); sbuf = lbuf + (offset*ext);
/* root sends the right buffer to the right receiver */ /* root sends the right buffer to the right receiver */
res = NBC_Sched_send (sbuf-(unsigned long)handle->tmpbuf, true, recvcount, datatype, r, schedule, false); res = NBC_Sched_send (sbuf, true, recvcount, datatype, r, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
} }
res = NBC_Sched_copy (redbuf-(unsigned long)handle->tmpbuf, true, recvcount, datatype, recvbuf, false, recvcount, if ((p != 1) || !inplace) {
datatype, schedule, false); res = NBC_Sched_copy (lbuf, true, recvcount, datatype, recvbuf, false, recvcount,
datatype, schedule, false);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
@ -193,16 +201,18 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype, int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sendbuf, void *recvbuf, int rcount, struct ompi_datatype_t *dtype,
struct ompi_op_t *op, struct ompi_communicator_t *comm, struct ompi_op_t *op, struct ompi_communicator_t *comm,
ompi_request_t **request, struct mca_coll_base_module_2_1_0_t *module) { ompi_request_t **request, struct mca_coll_base_module_2_1_0_t *module) {
int rank, res, count, rsize; int rank, res, count, lsize, rsize;
MPI_Aint ext; MPI_Aint ext;
ptrdiff_t gap, span;
NBC_Schedule *schedule; NBC_Schedule *schedule;
NBC_Handle *handle; NBC_Handle *handle;
ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
rank = ompi_comm_rank (comm); rank = ompi_comm_rank (comm);
lsize = ompi_comm_size (comm);
rsize = ompi_comm_remote_size (comm); rsize = ompi_comm_remote_size (comm);
res = ompi_datatype_type_extent (dtype, &ext); res = ompi_datatype_type_extent (dtype, &ext);
@ -216,10 +226,12 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i
return res; return res;
} }
count = rcount * rsize; count = rcount * lsize;
span = opal_datatype_span(&dtype->super, count, &gap);
if (count > 0) { if (count > 0) {
handle->tmpbuf = malloc (2 * ext * count); handle->tmpbuf = malloc (2 * span);
if (NULL == handle->tmpbuf) { if (NULL == handle->tmpbuf) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
@ -236,62 +248,60 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i
handle->schedule = schedule; handle->schedule = schedule;
/* send my data to the remote root */ /* send my data to the remote root */
res = NBC_Sched_send (sbuf, false, count, dtype, 0, schedule, false); res = NBC_Sched_send (sendbuf, false, count, dtype, 0, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
if (0 == rank) { if (0 == rank) {
res = NBC_Sched_recv ((void *) 0, true, count, dtype, 0, schedule, true); char *lbuf, *rbuf;
lbuf = (char *)(-gap);
rbuf = (char *)(span-gap);
res = NBC_Sched_recv (lbuf, true, count, dtype, 0, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
for (int peer = 1 ; peer < rsize ; ++peer) { for (int peer = 1 ; peer < rsize ; ++peer) {
res = NBC_Sched_recv ((void *)(ext * count), true, count, dtype, peer, schedule, true); char *tbuf;
res = NBC_Sched_recv (rbuf, true, count, dtype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
res = NBC_Sched_op ((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, dtype, op, res = NBC_Sched_op (lbuf, true, rbuf, true, count, dtype,
schedule, true); op, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
tbuf = lbuf; lbuf = rbuf; rbuf = tbuf;
} }
/* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ /* do the scatter with the local communicator */
res = NBC_Sched_recv ((void *)(ext * count), true, count, dtype, 0, schedule, false); res = NBC_Sched_copy (lbuf, true, rcount, dtype, recvbuf, false, rcount,
dtype, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
for (int peer = 1 ; peer < lsize ; ++peer) {
res = NBC_Sched_send ((void *) 0, true, count, dtype, 0, schedule, true); res = NBC_Sched_local_send (lbuf + ext * rcount * peer, true, rcount, dtype, peer, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle);
return res;
}
/* scatter */
for (int peer = 0 ; peer < rsize ; ++peer) {
res = NBC_Sched_send ((void *)(ext * (count + peer * rcount)), true, rcount, dtype, peer, schedule, false);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
} }
} }
} } else {
/* receive my block */
/* receive my block */ res = NBC_Sched_local_recv(recvbuf, false, rcount, dtype, 0, schedule, false);
res = NBC_Sched_recv(rbuf, true, rcount, dtype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle);
NBC_Return_handle (handle); return res;
return res; }
} }
/*NBC_PRINT_SCHED(*schedule);*/ /*NBC_PRINT_SCHED(*schedule);*/

Просмотреть файл

@ -36,16 +36,16 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) {
/* linear iscan /* linear iscan
* working principle: * working principle:
* 1. each node (but node 0) receives from left neigbor * 1. each node (but node 0) receives from left neighbor
* 2. performs op * 2. performs op
* 3. all but rank p-1 do sends to it's right neigbor and exits * 3. all but rank p-1 do sends to it's right neighbor and exits
* *
*/ */
int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
struct ompi_communicator_t *comm, ompi_request_t ** request, struct ompi_communicator_t *comm, ompi_request_t ** request,
struct mca_coll_base_module_2_1_0_t *module) { struct mca_coll_base_module_2_1_0_t *module) {
int rank, p, res; int rank, p, res;
MPI_Aint ext; ptrdiff_t gap, span;
NBC_Schedule *schedule; NBC_Schedule *schedule;
char inplace; char inplace;
NBC_Handle *handle; NBC_Handle *handle;
@ -56,13 +56,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da
rank = ompi_comm_rank (comm); rank = ompi_comm_rank (comm);
p = ompi_comm_size (comm); p = ompi_comm_size (comm);
res = ompi_datatype_type_extent (datatype, &ext); if (!inplace) {
if (MPI_SUCCESS != res) {
NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
return res;
}
if ((rank == 0) && !inplace) {
/* copy data to receivebuf */ /* copy data to receivebuf */
res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm); res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
@ -75,12 +69,6 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da
return res; return res;
} }
handle->tmpbuf = malloc (ext * count);
if (NULL == handle->tmpbuf) {
NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE;
}
#ifdef NBC_CACHE_SCHEDULE #ifdef NBC_CACHE_SCHEDULE
NBC_Scan_args *args, *found, search; NBC_Scan_args *args, *found, search;
@ -103,8 +91,15 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da
handle->schedule = schedule; handle->schedule = schedule;
if(rank != 0) { if(rank != 0) {
span = opal_datatype_span(&datatype->super, count, &gap);
handle->tmpbuf = malloc (span);
if (NULL == handle->tmpbuf) {
NBC_Return_handle (handle);
return OMPI_ERR_OUT_OF_RESOURCE;
}
/* we have to wait until we have the data */ /* we have to wait until we have the data */
res = NBC_Sched_recv (0, true, count, datatype, rank-1, schedule, true); res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);
return res; return res;
@ -112,7 +107,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da
/* perform the reduce in my local buffer */ /* perform the reduce in my local buffer */
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule,
true); true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
NBC_Return_handle (handle); NBC_Return_handle (handle);