diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c index ef9ace0ec4..6fb717b218 100644 --- a/ompi/mca/coll/libnbc/nbc.c +++ b/ompi/mca/coll/libnbc/nbc.c @@ -10,7 +10,7 @@ * rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler @@ -111,7 +111,7 @@ static int nbc_schedule_round_append (NBC_Schedule *schedule, void *data, int da } /* this function puts a send into the schedule */ -int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) { +static int NBC_Sched_send_internal (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, bool local, NBC_Schedule *schedule, bool barrier) { NBC_Args_send send_args; int ret; @@ -122,6 +122,7 @@ int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype dataty send_args.count = count; send_args.datatype = datatype; send_args.dest = dest; + send_args.local = local; /* append to the round-schedule */ ret = nbc_schedule_round_append (schedule, &send_args, sizeof (send_args), barrier); @@ -134,8 +135,16 @@ int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype dataty return OMPI_SUCCESS; } +int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) { + return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, false, schedule, barrier); +} + +int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier) { + return NBC_Sched_send_internal (buf, tmpbuf, count, datatype, dest, true, schedule, barrier); +} + /* this function puts a receive into the schedule */ -int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) { +static int NBC_Sched_recv_internal (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, bool local, NBC_Schedule *schedule, bool barrier) { NBC_Args_recv recv_args; int ret; @@ -146,6 +155,7 @@ int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, in recv_args.count = count; recv_args.datatype = datatype; recv_args.source = source; + recv_args.local = local; /* append to the round-schedule */ ret = nbc_schedule_round_append (schedule, &recv_args, sizeof (recv_args), barrier); @@ -158,8 +168,16 @@ int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, in return OMPI_SUCCESS; } +int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) { + return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, false, schedule, barrier); +} + +int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier) { + return NBC_Sched_recv_internal(buf, tmpbuf, count, datatype, source, true, schedule, barrier); +} + /* this function puts an operation into the schedule */ -int NBC_Sched_op (void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, +int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier) { NBC_Args_op op_args; int ret; @@ -168,10 +186,8 @@ int NBC_Sched_op (void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void op_args.type = OP; op_args.buf1 = buf1; op_args.buf2 = buf2; - op_args.buf3 = buf3; op_args.tmpbuf1 = tmpbuf1; op_args.tmpbuf2 = tmpbuf2; - op_args.tmpbuf3 = tmpbuf3; op_args.count = count; op_args.op = op; op_args.datatype = datatype; @@ -182,7 +198,7 @@ int NBC_Sched_op (void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void return ret; } - NBC_DEBUG(10, "added op - ends at byte %i\n", nbc_schedule_get_size (schedule)); + NBC_DEBUG(10, "added op2 - ends at byte %i\n", nbc_schedule_get_size (schedule)); return OMPI_SUCCESS; } @@ -373,7 +389,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) { NBC_Args_op opargs; NBC_Args_copy copyargs; NBC_Args_unpack unpackargs; - void *buf1, *buf2, *buf3; + void *buf1, *buf2; /* get round-schedule address */ ptr = handle->schedule->data + handle->row_offset; @@ -410,7 +426,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) { handle->req_array = tmp; res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag, - MCA_PML_BASE_SEND_STANDARD, handle->comm, + MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm, handle->req_array+handle->req_count - 1)); if (OMPI_SUCCESS != res) { NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count, @@ -444,7 +460,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) { handle->req_array = tmp; - res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, handle->comm, + res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm, handle->req_array+handle->req_count-1)); if (OMPI_SUCCESS != res) { NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count, @@ -456,10 +472,10 @@ static inline int NBC_Start_round(NBC_Handle *handle) { #endif break; case OP: - NBC_DEBUG(5, " OP (offset %li) ", offset); + NBC_DEBUG(5, " OP2 (offset %li) ", offset); NBC_GET_BYTES(ptr,opargs); - NBC_DEBUG(5, "*buf1: %p, buf2: %p, buf3: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2, - opargs.buf3, opargs.count, opargs.datatype); + NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2, + opargs.count, opargs.datatype); /* get buffers */ if(opargs.tmpbuf1) { buf1=(char*)handle->tmpbuf+(long)opargs.buf1; @@ -471,12 +487,7 @@ static inline int NBC_Start_round(NBC_Handle *handle) { } else { buf2=opargs.buf2; } - if(opargs.tmpbuf3) { - buf3=(char*)handle->tmpbuf+(long)opargs.buf3; - } else { - buf3=opargs.buf3; - } - ompi_3buff_op_reduce(opargs.op, buf1, buf2, buf3, opargs.count, opargs.datatype); + ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype); break; case COPY: NBC_DEBUG(5, " COPY (offset %li) ", offset); diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c index 2e1b0dd00b..0b624f90c4 100644 --- a/ompi/mca/coll/libnbc/nbc_iallreduce.c +++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c @@ -7,7 +7,7 @@ * rights reserved. * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler @@ -16,16 +16,17 @@ #include "nbc_internal.h" #include "ompi/communicator/communicator.h" #include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op.h" #include -static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, - void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf, + void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle); static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule, NBC_Handle *handle); static inline int allred_sched_linear(int rank, int p, const void *sendbuf, void *recvbuf, int count, - MPI_Datatype datatype, MPI_Op op, int ext, int size, + MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle); #ifdef NBC_CACHE_SCHEDULE @@ -62,6 +63,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M char inplace; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -85,7 +87,8 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M return res; } - handle->tmpbuf = malloc (ext * count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -101,7 +104,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M } /* algorithm selection */ - if(p < 4 || size*count < 65536 || inplace) { + if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) { alg = NBC_ARED_BINOMIAL; } else { alg = NBC_ARED_RING; @@ -128,7 +131,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M switch(alg) { case NBC_ARED_BINOMIAL: - res = allred_sched_diss(rank, p, count, datatype, sendbuf, recvbuf, op, schedule, handle); + res = allred_sched_diss(rank, p, count, datatype, gap, sendbuf, recvbuf, op, inplace, schedule, handle); break; case NBC_ARED_RING: res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, handle); @@ -199,6 +202,7 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; rank = ompi_comm_rank (comm); rsize = ompi_comm_remote_size (comm); @@ -220,7 +224,8 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co return res; } - handle->tmpbuf = malloc (ext * count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -235,7 +240,7 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co /* ensure the schedule is released with the handle on error */ handle->schedule = schedule; - res = allred_sched_linear (rank, rsize, sendbuf, recvbuf, count, datatype, op, + res = allred_sched_linear (rank, rsize, sendbuf, recvbuf, count, datatype, gap, op, ext, size, schedule, handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); @@ -296,13 +301,33 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co if (vrank == 0) rank = root; \ if (vrank == root) rank = 0; \ } -static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, - MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { +static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf, void *recvbuf, + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) { int root, vrank, maxr, vpeer, peer, res; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ RANK2VRANK(rank, vrank, root); maxr = (int)ceil((log((double)p)/LOG2)); + /* ensure the result ends up in recvbuf on vrank 0 */ + if (0 == (maxr%2)) { + rbuf = (void *)(-gap); + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } else { + lbuf = (void *)(-gap); + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + if (inplace) { + res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf) - gap, count, datatype, MPI_COMM_SELF); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { if ((vrank % (1 << r)) == 0) { @@ -311,35 +336,37 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat VRANK2RANK(peer, vpeer, root) if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ - if (firstred && MPI_IN_PLACE != sendbuf) { + if (firstred && !inplace) { /* perform the reduce with the senbuf */ - res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); } - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else { /* we have to send this round */ vpeer = vrank - (1 << (r - 1)); VRANK2RANK(peer, vpeer, root) - if (firstred && MPI_IN_PLACE != sendbuf) { + if (firstred && !inplace) { /* we have to use the sendbuf in the first round .. */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { - /* and the recvbuf in all remeining rounds */ - res = NBC_Sched_send (recvbuf, false, count, datatype, peer, schedule, false); + /* and the recvbuf in all remaining rounds */ + res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -373,6 +400,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat } } + if (0 == vrank) assert(lbuf == recvbuf); /* now send to the right hosts */ for (int r = 0; r < maxr; ++r) { if (((vrank + (1 << r) < p) && (vrank < (1 << r))) || (vrank == 0)) { @@ -550,9 +578,8 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat break; } - res = NBC_Sched_op ((char *) recvbuf + roffset, false, (char *) sendbuf + roffset, false, - (char *) recvbuf + roffset, false, segsizes[relement], datatype, op, schedule, - true); + res = NBC_Sched_op ((char *) sendbuf + roffset, false, (char *) recvbuf + roffset, false, + segsizes[relement], datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { break; } @@ -590,7 +617,7 @@ static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datat } static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle) { + ptrdiff_t gap, MPI_Op op, int ext, int size, NBC_Schedule *schedule, NBC_Handle *handle) { int res; if (0 == count) { @@ -603,34 +630,56 @@ static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, return res; } - res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, false); + /* recv my data to the remote root */ + if (0 != rank || 1 ==(rsize%2)) { + res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, false); + } else { + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule, false); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } if (0 == rank) { - /* wait for data from the remote root */ + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; + res = NBC_Sched_barrier (schedule); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* ensure the result ends up in recvbuf */ + if (0 == (rsize%2)) { + lbuf = (void *)(-gap); + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + } else { + rbuf = (void *)(-gap); + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } + /* get data from remote peers and reduce */ for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { - res = NBC_Sched_recv (0, true, count, datatype, rpeer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, rpeer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, - schedule, true); + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } /* exchange our result with the remote root (each root will broadcast to the other's peers) */ - res = NBC_Sched_recv (0, true, count, datatype, 0, schedule, false); + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } @@ -643,7 +692,7 @@ static inline int allred_sched_linear(int rank, int rsize, const void *sendbuf, /* broadcast the result to all remote peers */ for (int rpeer = 1 ; rpeer < rsize ; ++rpeer) { - res = NBC_Sched_send (0, true, count, datatype, rpeer, schedule, false); + res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rpeer, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } diff --git a/ompi/mca/coll/libnbc/nbc_iexscan.c b/ompi/mca/coll/libnbc/nbc_iexscan.c index 736f16c50b..73946e7dfa 100644 --- a/ompi/mca/coll/libnbc/nbc_iexscan.c +++ b/ompi/mca/coll/libnbc/nbc_iexscan.c @@ -45,7 +45,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { int rank, p, res; - MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; @@ -59,27 +59,27 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); - res = ompi_datatype_type_extent(datatype, &ext); - if (MPI_SUCCESS != res) { - NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); - return res; - } - res = NBC_Init_handle(comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - if (inplace && rank < p - 1) { - /* need more buffer space for the inplace case */ - handle->tmpbuf = malloc(ext * count * 2); - } else { - handle->tmpbuf = malloc(ext * count); - } - - if (handle->tmpbuf == NULL) { - NBC_Return_handle (handle); - return OMPI_ERR_OUT_OF_RESOURCE; + span = opal_datatype_span(&datatype->super, count, &gap); + if (0 < rank) { + handle->tmpbuf = malloc(span); + if (handle->tmpbuf == NULL) { + NBC_Return_handle (handle); + return OMPI_ERR_OUT_OF_RESOURCE; + } + if (inplace) { + NBC_Copy(recvbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); + } else { + NBC_Copy(sendbuf, count, datatype, (char *)handle->tmpbuf-gap, count, datatype, comm); + } + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); + return res; + } } #ifdef NBC_CACHE_SCHEDULE @@ -102,13 +102,7 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ handle->schedule = schedule; if (rank != 0) { - if (inplace && rank < p - 1) { - /* if sendbuf == recvbuf do not clobber the send buffer until it has been combined - * with the incoming data. */ - res = NBC_Sched_recv ((void *) (ext * count), true, count, datatype, rank-1, schedule, false); - } else { - res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false); - } + res = NBC_Sched_recv (recvbuf, false, count, datatype, rank-1, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); @@ -123,15 +117,8 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ return res; } - /* perform the reduce in my temporary buffer */ - /* this cannot be done until handle->tmpbuf is unused :-( so barrier after */ - if (inplace) { - res = NBC_Sched_op (0, true, sendbuf, false, (void *)(ext * count), true, count, - datatype, op, schedule, true); - } else { - res = NBC_Sched_op (0, true, sendbuf, false, recvbuf, false, count, datatype, op, - schedule, true); - } + res = NBC_Sched_op (recvbuf, false, (void *)(-gap), true, count, + datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); @@ -139,24 +126,18 @@ int ompi_coll_libnbc_iexscan(const void* sendbuf, void* recvbuf, int count, MPI_ } /* send reduced data onward */ - res = NBC_Sched_send (0, true, count, datatype, rank + 1, schedule, false); + res = NBC_Sched_send ((void *)(-gap), true, count, datatype, rank + 1, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - - if (inplace) { - /* copy the received data into the receive buffer */ - res = NBC_Sched_copy ((void *)(ext * count), true, count, datatype, recvbuf, - false, count, datatype, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - NBC_Return_handle (handle); - return res; - } - } } } else if (p > 1) { - res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule, false); + if (inplace) { + res = NBC_Sched_send (recvbuf, false, count, datatype, 1, schedule, false); + } else { + res = NBC_Sched_send (sendbuf, false, count, datatype, 1, schedule, false); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h index 7b7e3210f9..76a248d3d9 100644 --- a/ompi/mca/coll/libnbc/nbc_internal.h +++ b/ompi/mca/coll/libnbc/nbc_internal.h @@ -10,7 +10,7 @@ * * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. @@ -92,6 +92,7 @@ typedef struct { MPI_Datatype datatype; int dest; char tmpbuf; + bool local; } NBC_Args_send; /* the receive argument struct */ @@ -102,6 +103,7 @@ typedef struct { MPI_Datatype datatype; char tmpbuf; int source; + bool local; } NBC_Args_recv; /* the operation argument struct */ @@ -109,10 +111,8 @@ typedef struct { NBC_Fn_type type; char tmpbuf1; char tmpbuf2; - char tmpbuf3; const void *buf1; void *buf2; - void *buf3; MPI_Op op; MPI_Datatype datatype; int count; @@ -144,8 +144,10 @@ typedef struct { /* internal function prototypes */ int NBC_Sched_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier); +int NBC_Sched_local_send (const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest,NBC_Schedule *schedule, bool barrier); int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier); -int NBC_Sched_op (void* buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, +int NBC_Sched_local_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier); +int NBC_Sched_op (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier); int NBC_Sched_copy (void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce.c b/ompi/mca/coll/libnbc/nbc_ireduce.c index 0045deb6a5..387988a4f5 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce.c @@ -7,20 +7,23 @@ * rights reserved. * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Author(s): Torsten Hoefler * */ + +#include "ompi/op/op.h" + #include "nbc_internal.h" -static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle); +static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle); static inline int red_sched_chain (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int ext, size_t size, NBC_Schedule *schedule, NBC_Handle *handle, int fragsize); -static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, +static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle); #ifdef NBC_CACHE_SCHEDULE @@ -55,6 +58,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ enum { NBC_RED_BINOMIAL, NBC_RED_CHAIN } alg; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -89,19 +93,22 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ return res; } + span = opal_datatype_span(&datatype->super, count, &gap); + /* algorithm selection */ - if (p > 4 || size * count < 65536) { + if (p > 4 || size * count < 65536 || !ompi_op_is_commute(op)) { alg = NBC_RED_BINOMIAL; if(rank == root) { /* root reduces in receivebuffer */ - handle->tmpbuf = malloc (ext * count); + handle->tmpbuf = malloc (span); + redbuf = recvbuf; } else { /* recvbuf may not be valid on non-root nodes */ - handle->tmpbuf = malloc (ext * count * 2); - redbuf = (char*) handle->tmpbuf + ext * count; + handle->tmpbuf = malloc (2*span); + redbuf = (char*) handle->tmpbuf + span - gap; } } else { - handle->tmpbuf = malloc (ext * count); + handle->tmpbuf = malloc (span); alg = NBC_RED_CHAIN; segsize = 16384/2; } @@ -135,7 +142,7 @@ int ompi_coll_libnbc_ireduce(const void* sendbuf, void* recvbuf, int count, MPI_ switch(alg) { case NBC_RED_BINOMIAL: - res = red_sched_binomial(rank, p, root, sendbuf, recvbuf, count, datatype, op, redbuf, schedule, handle); + res = red_sched_binomial(rank, p, root, sendbuf, redbuf, count, datatype, op, inplace, schedule, handle); break; case NBC_RED_CHAIN: res = red_sched_chain(rank, p, root, sendbuf, recvbuf, count, datatype, op, ext, size, schedule, handle, segsize); @@ -201,25 +208,20 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count struct mca_coll_base_module_2_1_0_t *module) { int rank, res, rsize; NBC_Schedule *schedule; - MPI_Aint ext; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + ptrdiff_t span, gap; rank = ompi_comm_rank (comm); rsize = ompi_comm_remote_size (comm); - res = ompi_datatype_type_extent (datatype, &ext); - if (MPI_SUCCESS != res) { - NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); - return res; - } - res = NBC_Init_handle(comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - handle->tmpbuf = malloc (ext * count); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -231,7 +233,7 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count return OMPI_ERR_OUT_OF_RESOURCE; } - res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, count, datatype, op, schedule, handle); + res = red_sched_linear (rank, rsize, root, sendbuf, recvbuf, (void *)(-gap), count, datatype, op, schedule, handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -257,6 +259,8 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count /* binomial reduce + * if op is not commutative, reduce on rank 0, and then send the result to root rank + * * working principle: * - each node gets a virtual rank vrank * - the 'root' node get vrank 0 @@ -285,65 +289,81 @@ int ompi_coll_libnbc_ireduce_inter(const void* sendbuf, void* recvbuf, int count if (vrank == 0) rank = root; \ if (vrank == root) rank = 0; \ } -static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, - MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle) { - int vrank, vpeer, peer, res, maxr; +static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *redbuf, int count, MPI_Datatype datatype, + MPI_Op op, char inplace, NBC_Schedule *schedule, NBC_Handle *handle) { + int vroot, vrank, vpeer, peer, res, maxr; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; + ptrdiff_t gap; + (void)opal_datatype_span(&datatype->super, count, &gap); - RANK2VRANK(rank, vrank, root); + if (ompi_op_is_commute(op)) { + vroot = root; + } else { + vroot = 0; + } + RANK2VRANK(rank, vrank, vroot); maxr = (int)ceil((log((double)p)/LOG2)); + /* ensure the result ends up in redbuf on vrank 0 */ + if (0 == (maxr%2)) { + rbuf = (void *)(-gap); + tmprbuf = true; + lbuf = redbuf; + tmplbuf = false; + } else { + lbuf = (void *)(-gap); + tmplbuf = true; + rbuf = redbuf; + tmprbuf = false; + if (inplace) { + res = NBC_Copy(rbuf, count, datatype, ((char *)handle->tmpbuf)-gap, count, datatype, MPI_COMM_SELF); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + return res; + } + } + } + for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { if ((vrank % (1 << r)) == 0) { /* we have to receive this round */ vpeer = vrank + (1 << (r - 1)); - VRANK2RANK(peer, vpeer, root) + VRANK2RANK(peer, vpeer, vroot) if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } /* perform the reduce in my local buffer */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ - if (firstred) { - if (rank == root) { - /* root is the only one who reduces in the receivebuffer - * take data from sendbuf in first round - save copy */ - res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true); - } else { - /* all others may not have a receive buffer - * take data from sendbuf in first round - save copy */ - res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, sendbuf, false, 0, true, count, - datatype, op, schedule, true); - } + if (firstred && !inplace) { + /* perform the reduce with the senbuf */ + res = NBC_Sched_op (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true); firstred = 0; } else { - if(rank == root) { - /* root is the only one who reduces in the receivebuffer */ - res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true); - } else { - /* all others may not have a receive buffer */ - res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, (char *) redbuf - (intptr_t) handle->tmpbuf, - true, 0, true, count, datatype, op, schedule, true); - } + /* perform the reduce in my local buffer */ + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else { /* we have to send this round */ vpeer = vrank - (1 << (r - 1)); - VRANK2RANK(peer, vpeer, root) - if (firstred) { - /* we did not reduce anything */ + VRANK2RANK(peer, vpeer, vroot) + if (firstred && !inplace) { + /* we have to use the sendbuf in the first round .. */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { - /* we have to use the redbuf the root (which works in receivebuf) is never sending .. */ - res = NBC_Sched_send ((char *) redbuf - (intptr_t) handle->tmpbuf, true, count, datatype, peer, schedule, - false); + /* and the redbuf in all remaining rounds */ + res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -354,6 +374,14 @@ static inline int red_sched_binomial (int rank, int p, int root, const void *sen break; } } + /* send to root if vroot ! root */ + if (vroot != root) { + if (0 == rank) { + res = NBC_Sched_send (redbuf, false, count, datatype, root, schedule, false); + } else if (root == rank) { + res = NBC_Sched_recv (redbuf, false, count, datatype, vroot, schedule, false); + } + } return OMPI_SUCCESS; } @@ -389,18 +417,22 @@ static inline int red_sched_chain (int rank, int p, int root, const void *sendbu /* last node does not recv */ if (vrank != p-1) { - res = NBC_Sched_recv ((char *) offset, true, thiscount, datatype, rpeer, schedule, true); + if (vrank == 0) { + res = NBC_Sched_recv ((char *)recvbuf+offset, false, thiscount, datatype, rpeer, schedule, true); + } else { + res = NBC_Sched_recv ((char *) offset, true, thiscount, datatype, rpeer, schedule, true); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } /* root reduces into receivebuf */ if(vrank == 0) { - res = NBC_Sched_op ((char *) recvbuf + offset, false, (char *) sendbuf + offset, false, (char *) offset, true, - thiscount, datatype, op, schedule, true); + res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) recvbuf + offset, false, + thiscount, datatype, op, schedule, true); } else { - res = NBC_Sched_op ((char *) offset, true, (char *) sendbuf + offset, false, (char *) offset, true, thiscount, - datatype, op, schedule, true); + res = NBC_Sched_op ((char *) sendbuf + offset, false, (char *) offset, true, thiscount, + datatype, op, schedule, true); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -428,33 +460,51 @@ static inline int red_sched_chain (int rank, int p, int root, const void *sendbu } /* simple linear algorithm for intercommunicators */ -static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, +static inline int red_sched_linear (int rank, int rsize, int root, const void *sendbuf, void *recvbuf, void *tmpbuf, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { int res; + char *rbuf, *lbuf, *buf; + int tmprbuf, tmplbuf; if (0 == count) { return OMPI_SUCCESS; } if (MPI_ROOT == root) { - res = NBC_Sched_recv (recvbuf, false, count, datatype, 0, schedule, true); + /* ensure the result ends up in recvbuf */ + if (0 == (rsize%2)) { + lbuf = tmpbuf; + tmplbuf = true; + rbuf = recvbuf; + tmprbuf = false; + } else { + rbuf = tmpbuf; + tmprbuf = true; + lbuf = recvbuf; + tmplbuf = false; + } + + res = NBC_Sched_recv (lbuf, tmplbuf, count, datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } for (int peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } - res = NBC_Sched_op (recvbuf, false, 0, true, recvbuf, false, count, datatype, op, schedule, true); + res = NBC_Sched_op (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; + tmprbuf ^= 1; tmplbuf ^= 1; } } else if (MPI_PROC_NULL != root) { - res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule, false); + res = NBC_Sched_send (sendbuf, false, count, datatype, root, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c index cd1dad14e7..c5fb0fdbbb 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter.c @@ -7,7 +7,7 @@ * rights reserved. * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -40,10 +40,12 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i struct mca_coll_base_module_2_1_0_t *module) { int peer, rank, maxr, p, res, count; MPI_Aint ext; - char *redbuf, *sbuf, inplace; + ptrdiff_t gap, span; + char *sbuf, inplace; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; + char *rbuf, *lbuf, *buf; NBC_IN_PLACE(sendbuf, recvbuf, inplace); @@ -81,13 +83,15 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i maxr = (int) ceil ((log((double) p) / LOG2)); - handle->tmpbuf = malloc (ext * count * 2); + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span * 2); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; } - redbuf = (char *) handle->tmpbuf + ext * count; + rbuf = (char *)(-gap); + lbuf = (char *)(span - gap); schedule = OBJ_NEW(NBC_Schedule); if (OPAL_UNLIKELY(NULL == schedule)) { @@ -104,7 +108,7 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i peer = rank + (1 << (r - 1)); if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv(0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv(rbuf, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -113,19 +117,19 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ if (firstred) { /* take reduce data from the sendbuf in the first round -> save copy */ - res = NBC_Sched_op (redbuf - (intptr_t) handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, - op, schedule, true); + res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule, true); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op (redbuf - (intptr_t) handle->tmpbuf, true, redbuf - (intptr_t) handle->tmpbuf, true, - 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule, true); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; } } else { /* we have to send this round */ @@ -134,8 +138,8 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i /* we have to send the senbuf */ res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { - /* we send an already reduced value from redbuf */ - res = NBC_Sched_send (redbuf - (intptr_t) handle->tmpbuf, true, count, datatype, peer, schedule, false); + /* we send an already reduced value from lbuf */ + res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); @@ -157,9 +161,9 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i if (rank == 0) { for (long int r = 1, offset = 0 ; r < p ; ++r) { offset += recvcounts[r-1]; - sbuf = redbuf + offset * ext; + sbuf = lbuf + (offset*ext); /* root sends the right buffer to the right receiver */ - res = NBC_Sched_send (sbuf - (intptr_t) handle->tmpbuf, true, recvcounts[r], datatype, r, schedule, + res = NBC_Sched_send (sbuf, true, recvcounts[r], datatype, r, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); @@ -167,7 +171,7 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i } } - res = NBC_Sched_copy (redbuf - (intptr_t) handle->tmpbuf, true, recvcounts[0], datatype, recvbuf, false, + res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false, recvcounts[0], datatype, schedule, false); } else { res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false); @@ -199,13 +203,15 @@ int ompi_coll_libnbc_ireduce_scatter(const void* sendbuf, void* recvbuf, const i int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, const int *recvcounts, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { - int rank, res, count, rsize; + int rank, res, count, lsize, rsize; MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; rank = ompi_comm_rank (comm); + lsize = ompi_comm_size(comm); rsize = ompi_comm_remote_size (comm); res = ompi_datatype_type_extent (datatype, &ext); @@ -215,17 +221,19 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, } count = 0; - for (int r = 0 ; r < rsize ; ++r) { + for (int r = 0 ; r < lsize ; ++r) { count += recvcounts[r]; } + span = opal_datatype_span(&datatype->super, count, &gap); + res = NBC_Init_handle(comm, &handle, libnbc_module); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } if (count > 0) { - handle->tmpbuf = malloc (2 * ext * count); + handle->tmpbuf = malloc (2 * span); if (OPAL_UNLIKELY(NULL == handle->tmpbuf)) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -249,44 +257,55 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, } if (0 == rank) { - res = NBC_Sched_recv ((void *) 0, true, count, datatype, 0, schedule, true); + char *lbuf, *rbuf; + lbuf = (char *)(-gap); + rbuf = (char *)(span-gap); + res = NBC_Sched_recv (lbuf, true, count, datatype, 0, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } for (int peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv ((void *)(ext * count), true, count, datatype, peer, schedule, true); + char *tbuf; + res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - res = NBC_Sched_op ((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, datatype, + res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + tbuf = lbuf; lbuf = rbuf; rbuf = tbuf; } /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ - res = NBC_Sched_recv ((void *)(ext * count), true, count, datatype, 0, schedule, false); + res = NBC_Sched_recv (rbuf, true, count, datatype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - res = NBC_Sched_send ((void *) 0, true, count, datatype, 0, schedule, true); + res = NBC_Sched_send (lbuf, true, count, datatype, 0, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - /* scatter */ - for (int peer = 0, offset = ext * count ; peer < rsize ; ++peer) { - res = NBC_Sched_send ((void *)(uintptr_t) offset, true, recvcounts[peer], datatype, peer, schedule, - false); + /* do the local scatterv with the local communicator */ + res = NBC_Sched_copy (lbuf, true, recvcounts[0], datatype, recvbuf, false, + recvcounts[0], datatype, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); + return res; + } + for (int peer = 1, offset = recvcounts[0] * ext; peer < lsize ; ++peer) { + res = NBC_Sched_local_send (lbuf + offset, true, recvcounts[peer], datatype, peer, schedule, + false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -294,13 +313,13 @@ int ompi_coll_libnbc_ireduce_scatter_inter (const void* sendbuf, void* recvbuf, offset += recvcounts[peer] * ext; } - } - - /* receive my block */ - res = NBC_Sched_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - NBC_Return_handle (handle); - return res; + } else { + /* receive my block */ + res = NBC_Sched_local_recv (recvbuf, false, recvcounts[rank], datatype, 0, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); + return res; + } } res = NBC_Sched_commit (schedule); diff --git a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c index 4d553c6f55..f0c6721889 100644 --- a/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c +++ b/ompi/mca/coll/libnbc/nbc_ireduce_scatter_block.c @@ -38,6 +38,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i struct mca_coll_base_module_2_1_0_t *module) { int peer, rank, maxr, p, res, count; MPI_Aint ext; + ptrdiff_t gap, span; char *redbuf, *sbuf, inplace; NBC_Schedule *schedule; NBC_Handle *handle; @@ -73,14 +74,19 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i count = p * recvcount; if (0 < count) { - handle->tmpbuf = malloc (ext*count*2); + char *rbuf, *lbuf, *buf; + + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (2*span); if (NULL == handle->tmpbuf) { OMPI_COLL_LIBNBC_REQUEST_RETURN(handle); OBJ_RELEASE(schedule); return OMPI_ERR_OUT_OF_RESOURCE; } - redbuf = (char *) handle->tmpbuf + ext * count; + rbuf = (void *)(-gap); + lbuf = (char *)(span - gap); + redbuf = (char *) handle->tmpbuf + span - gap; /* copy data to redbuf if we only have a single node */ if ((p == 1) && !inplace) { @@ -98,7 +104,7 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i peer = rank + (1 << (r - 1)); if (peer < p) { /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); + res = NBC_Sched_recv (rbuf, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -106,29 +112,29 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i if (firstred) { /* take reduce data from the sendbuf in the first round -> save copy */ - res = NBC_Sched_op (redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, - datatype, op, schedule, true); + res = NBC_Sched_op (sendbuf, false, rbuf, true, count, datatype, op, schedule, true); firstred = 0; } else { /* perform the reduce in my local buffer */ - res = NBC_Sched_op (redbuf-(unsigned long)handle->tmpbuf, true, redbuf-(unsigned long)handle->tmpbuf, - true, 0, true, count, datatype, op, schedule, true); + res = NBC_Sched_op (lbuf, true, rbuf, true, count, datatype, op, schedule, true); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + /* swap left and right buffers */ + buf = rbuf; rbuf = lbuf ; lbuf = buf; } } else { /* we have to send this round */ peer = rank - (1 << (r - 1)); if(firstred) { /* we have to send the senbuf */ - res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, true); + res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false); } else { /* we send an already reduced value from redbuf */ - res = NBC_Sched_send (redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule, true); + res = NBC_Sched_send (lbuf, true, count, datatype, peer, schedule, false); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -157,17 +163,19 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i } else { for (int r = 1, offset = 0 ; r < p ; ++r) { offset += recvcount; - sbuf = ((char *)redbuf) + (offset*ext); + sbuf = lbuf + (offset*ext); /* root sends the right buffer to the right receiver */ - res = NBC_Sched_send (sbuf-(unsigned long)handle->tmpbuf, true, recvcount, datatype, r, schedule, false); + res = NBC_Sched_send (sbuf, true, recvcount, datatype, r, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } } - res = NBC_Sched_copy (redbuf-(unsigned long)handle->tmpbuf, true, recvcount, datatype, recvbuf, false, recvcount, - datatype, schedule, false); + if ((p != 1) || !inplace) { + res = NBC_Sched_copy (lbuf, true, recvcount, datatype, recvbuf, false, recvcount, + datatype, schedule, false); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -193,16 +201,18 @@ int ompi_coll_libnbc_ireduce_scatter_block(const void* sendbuf, void* recvbuf, i return OMPI_SUCCESS; } -int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, int rcount, struct ompi_datatype_t *dtype, +int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sendbuf, void *recvbuf, int rcount, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, ompi_request_t **request, struct mca_coll_base_module_2_1_0_t *module) { - int rank, res, count, rsize; + int rank, res, count, lsize, rsize; MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; NBC_Handle *handle; ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module; rank = ompi_comm_rank (comm); + lsize = ompi_comm_size (comm); rsize = ompi_comm_remote_size (comm); res = ompi_datatype_type_extent (dtype, &ext); @@ -216,10 +226,12 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i return res; } - count = rcount * rsize; + count = rcount * lsize; + + span = opal_datatype_span(&dtype->super, count, &gap); if (count > 0) { - handle->tmpbuf = malloc (2 * ext * count); + handle->tmpbuf = malloc (2 * span); if (NULL == handle->tmpbuf) { NBC_Return_handle (handle); return OMPI_ERR_OUT_OF_RESOURCE; @@ -236,62 +248,60 @@ int ompi_coll_libnbc_ireduce_scatter_block_inter(const void *sbuf, void *rbuf, i handle->schedule = schedule; /* send my data to the remote root */ - res = NBC_Sched_send (sbuf, false, count, dtype, 0, schedule, false); + res = NBC_Sched_send (sendbuf, false, count, dtype, 0, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } if (0 == rank) { - res = NBC_Sched_recv ((void *) 0, true, count, dtype, 0, schedule, true); + char *lbuf, *rbuf; + lbuf = (char *)(-gap); + rbuf = (char *)(span-gap); + res = NBC_Sched_recv (lbuf, true, count, dtype, 0, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } for (int peer = 1 ; peer < rsize ; ++peer) { - res = NBC_Sched_recv ((void *)(ext * count), true, count, dtype, peer, schedule, true); + char *tbuf; + res = NBC_Sched_recv (rbuf, true, count, dtype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - res = NBC_Sched_op ((void *) 0, true, (void *)(ext * count), true, (void *) 0, true, count, dtype, op, - schedule, true); + res = NBC_Sched_op (lbuf, true, rbuf, true, count, dtype, + op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } + tbuf = lbuf; lbuf = rbuf; rbuf = tbuf; } - /* exchange data with remote root for scatter phase (we *could* use the local communicator to do the scatter) */ - res = NBC_Sched_recv ((void *)(ext * count), true, count, dtype, 0, schedule, false); + /* do the scatter with the local communicator */ + res = NBC_Sched_copy (lbuf, true, rcount, dtype, recvbuf, false, rcount, + dtype, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } - - res = NBC_Sched_send ((void *) 0, true, count, dtype, 0, schedule, true); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - NBC_Return_handle (handle); - return res; - } - - /* scatter */ - for (int peer = 0 ; peer < rsize ; ++peer) { - res = NBC_Sched_send ((void *)(ext * (count + peer * rcount)), true, rcount, dtype, peer, schedule, false); + for (int peer = 1 ; peer < lsize ; ++peer) { + res = NBC_Sched_local_send (lbuf + ext * rcount * peer, true, rcount, dtype, peer, schedule, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; } } - } - - /* receive my block */ - res = NBC_Sched_recv(rbuf, true, rcount, dtype, 0, schedule, false); - if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { - NBC_Return_handle (handle); - return res; + } else { + /* receive my block */ + res = NBC_Sched_local_recv(recvbuf, false, rcount, dtype, 0, schedule, false); + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { + NBC_Return_handle (handle); + return res; + } } /*NBC_PRINT_SCHED(*schedule);*/ diff --git a/ompi/mca/coll/libnbc/nbc_iscan.c b/ompi/mca/coll/libnbc/nbc_iscan.c index a239d14ed1..5b8b0bbdc1 100644 --- a/ompi/mca/coll/libnbc/nbc_iscan.c +++ b/ompi/mca/coll/libnbc/nbc_iscan.c @@ -36,16 +36,16 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) { /* linear iscan * working principle: - * 1. each node (but node 0) receives from left neigbor + * 1. each node (but node 0) receives from left neighbor * 2. performs op - * 3. all but rank p-1 do sends to it's right neigbor and exits + * 3. all but rank p-1 do sends to it's right neighbor and exits * */ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_1_0_t *module) { int rank, p, res; - MPI_Aint ext; + ptrdiff_t gap, span; NBC_Schedule *schedule; char inplace; NBC_Handle *handle; @@ -56,13 +56,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da rank = ompi_comm_rank (comm); p = ompi_comm_size (comm); - res = ompi_datatype_type_extent (datatype, &ext); - if (MPI_SUCCESS != res) { - NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res); - return res; - } - - if ((rank == 0) && !inplace) { + if (!inplace) { /* copy data to receivebuf */ res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { @@ -75,12 +69,6 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da return res; } - handle->tmpbuf = malloc (ext * count); - if (NULL == handle->tmpbuf) { - NBC_Return_handle (handle); - return OMPI_ERR_OUT_OF_RESOURCE; - } - #ifdef NBC_CACHE_SCHEDULE NBC_Scan_args *args, *found, search; @@ -103,8 +91,15 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da handle->schedule = schedule; if(rank != 0) { + span = opal_datatype_span(&datatype->super, count, &gap); + handle->tmpbuf = malloc (span); + if (NULL == handle->tmpbuf) { + NBC_Return_handle (handle); + return OMPI_ERR_OUT_OF_RESOURCE; + } + /* we have to wait until we have the data */ - res = NBC_Sched_recv (0, true, count, datatype, rank-1, schedule, true); + res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle); return res; @@ -112,7 +107,7 @@ int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Da /* perform the reduce in my local buffer */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ - res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, + res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { NBC_Return_handle (handle);