diff --git a/ompi/mca/coll/base/README.memory_management b/ompi/mca/coll/base/README.memory_management new file mode 100644 index 0000000000..1e34f577c1 --- /dev/null +++ b/ompi/mca/coll/base/README.memory_management @@ -0,0 +1,124 @@ + /* This comment applies to all collectives (including the basic + * module) where we allocate a temporary buffer. For the next few + * lines of code, it's tremendously complicated how we decided that + * this was the Right Thing to do. Sit back and enjoy. And prepare + * to have your mind warped. :-) + * + * Recall some definitions (I always get these backwards, so I'm + * going to put them here): + * + * extent: the length from the lower bound to the upper bound -- may + * be considerably larger than the buffer required to hold the data + * (or smaller! But it's easiest to think about when it's larger). + * + * true extent: the exact number of bytes required to hold the data + * in the layout pattern in the datatype. + * + * For example, consider the following buffer (just talking about + * true_lb, extent, and true extent -- extrapolate for true_ub): + * + * A B C + * -------------------------------------------------------- + * | | | + * -------------------------------------------------------- + * + * There are multiple cases: + * + * 1. A is what we give to MPI_Send (and friends), and A is where + * the data starts, and C is where the data ends. In this case: + * + * - extent: C-A + * - true extent: C-A + * - true_lb: 0 + * + * A C + * -------------------------------------------------------- + * | | + * -------------------------------------------------------- + * <=======================extent=========================> + * <======================true extent=====================> + * + * 2. A is what we give to MPI_Send (and friends), B is where the + * data starts, and C is where the data ends. 
In this case: + * + * - extent: C-A + * - true extent: C-B + * - true_lb: positive + * + * A B C + * -------------------------------------------------------- + * | | User buffer | + * -------------------------------------------------------- + * <=======================extent=========================> + * <===============true extent=============> + * + * 3. B is what we give to MPI_Send (and friends), A is where the + * data starts, and C is where the data ends. In this case: + * + * - extent: C-A + * - true extent: C-A + * - true_lb: negative + * + * A B C + * -------------------------------------------------------- + * | | User buffer | + * -------------------------------------------------------- + * <=======================extent=========================> + * <======================true extent=====================> + * + * 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is + * where the data starts, and C is where the data ends. In this + * case: + * + * - extent: C-MPI_BOTTOM + * - true extent: C-B + * - true_lb: [potentially very large] positive + * + * MPI_BOTTOM B C + * -------------------------------------------------------- + * | | User buffer | + * -------------------------------------------------------- + * <=======================extent=========================> + * <===============true extent=============> + * + * So in all cases, for a temporary buffer, all we need to malloc() + * is a buffer of size true_extent. We therefore need to know two + * pointer values: what value to give to MPI_Send (and friends) and + * what value to give to free(), because they might not be the same. + * + * Clearly, what we give to free() is exactly what was returned from + * malloc(). That part is easy. :-) + * + * What we give to MPI_Send (and friends) is a bit more complicated. + * Let's take the 4 cases from above: + * + * 1. 
If A is what we give to MPI_Send and A is where the data + * starts, then clearly we give to MPI_Send what we got back from + * malloc(). + * + * 2. If B is what we get back from malloc, but we give A to + * MPI_Send, then the buffer range [A,B) represents "dead space" + * -- no data will be put there. So it's safe to give B-true_lb to + * MPI_Send. More specifically, the true_lb is positive, so B-true_lb is + * actually A. + * + * 3. If A is what we get back from malloc, and B is what we give to + * MPI_Send, then the true_lb is negative, so A-true_lb will actually equal + * B. + * + * 4. Although this seems like the weirdest case, it's actually + * quite similar to case #2 -- the pointer we give to MPI_Send is + * smaller than the pointer we got back from malloc(). + * + * Hence, in all cases, we give (return_from_malloc - true_lb) to MPI_Send. + * + * This works fine and dandy if we only have (count==1), which we + * rarely do. ;-) So we really need to allocate (true_extent + + * ((count - 1) * extent)) to get enough space for the rest. This may + * be more than is necessary, but it's ok. + * + * Simple, no? :-) + * + */ + + diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c index 14ecc39067..a0fcff8272 100644 --- a/ompi/mca/coll/base/coll_base_allgather.c +++ b/ompi/mca/coll/base/coll_base_allgather.c @@ -167,19 +167,16 @@ int ompi_coll_base_allgather_intra_bruck(const void *sbuf, int scount, - copy blocks from shift buffer starting at block [rank] in rbuf. 
*/ if (0 != rank) { - ptrdiff_t true_extent, true_lb; char *free_buf = NULL, *shift_buf = NULL; + ptrdiff_t span, gap; - err = ompi_datatype_get_true_extent(rdtype, &true_lb, &true_extent); - if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } + span = opal_datatype_span(&rdtype->super, (size - rank) * rcount, &gap); - free_buf = (char*) calloc(((true_extent + - ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount - 1) * rext)), - sizeof(char)); + free_buf = (char*)calloc(span, sizeof(char)); if (NULL == free_buf) { line = __LINE__; err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; } - shift_buf = free_buf - true_lb; + shift_buf = free_buf - gap; /* 1. copy blocks [0 .. (size - rank - 1)] from rbuf to shift buffer */ err = ompi_datatype_copy_content_same_ddt(rdtype, ((ptrdiff_t)(size - rank) * (ptrdiff_t)rcount), diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 277ea6abc5..0c3ec93578 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -135,8 +135,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; - ptrdiff_t true_lb, true_extent, lb, extent; ompi_request_t *reqs[2] = {NULL, NULL}; + OPAL_PTRDIFF_TYPE span, gap; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -154,12 +154,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, } /* Allocate and initialize temporary send buffer */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - - inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent); + span = 
opal_datatype_span(&dtype->super, count, &gap); + inplacebuf = (char*) malloc(span); if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } if (MPI_IN_PLACE == sbuf) { @@ -629,9 +625,9 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int int segcount, max_segcount, num_phases, phase, block_count, inbi; size_t typelng; char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL}; - ptrdiff_t true_lb, true_extent, lb, extent; ptrdiff_t block_offset, max_real_segsize; ompi_request_t *reqs[2] = {NULL, NULL}; + OPAL_PTRDIFF_TYPE lb, extent, gap; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -649,10 +645,6 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int } /* Determine segment count based on the suggested segment size */ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } ret = ompi_datatype_type_size( dtype, &typelng); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } segcount = count; @@ -685,7 +677,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int early_blockcount, late_blockcount ); COLL_BASE_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi, max_segcount, k); - max_real_segsize = true_extent + (ptrdiff_t)(max_segcount - 1) * extent; + + ret = ompi_datatype_get_extent(dtype, &lb, &extent); + if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } + max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap); /* Allocate and initialize temporary buffers */ inbuf[0] = (char*)malloc(max_real_segsize); @@ -740,8 +735,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int block_count = ((rank < split_rank)? 
early_blockcount : late_blockcount); COLL_BASE_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase, early_phase_segcount, late_phase_segcount) - phase_count = ((phase < split_phase)? - (early_phase_segcount) : (late_phase_segcount)); + phase_count = ((phase < split_phase)? + (early_phase_segcount) : (late_phase_segcount)); phase_offset = ((phase < split_phase)? ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) : ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase)); diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index 5b1f13160a..48c13afb59 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -43,10 +43,10 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err = MPI_SUCCESS, line; + OPAL_PTRDIFF_TYPE ext, gap; MPI_Request *preq; char *tmp_buffer; size_t max_size; - ptrdiff_t ext, true_lb, true_ext; /* Initialize. 
*/ @@ -60,14 +60,14 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); - ompi_datatype_get_true_extent ( rdtype, &true_lb, &true_ext); - max_size = true_ext + ext * (rcount-1); + max_size = opal_datatype_span(&rdtype->super, rcount, &gap); /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmp_buffer -= gap; max_size = ext * rcount; /* in-place alltoall slow algorithm (but works) */ @@ -199,7 +199,7 @@ int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount, int i, k, line = -1, rank, size, err = 0; int sendto, recvfrom, distance, *displs = NULL, *blen = NULL; char *tmpbuf = NULL, *tmpbuf_free = NULL; - ptrdiff_t rlb, slb, tlb, sext, rext, tsext; + OPAL_PTRDIFF_TYPE sext, rext, span, gap; struct ompi_datatype_t *new_ddt; if (MPI_IN_PLACE == sbuf) { @@ -213,15 +213,13 @@ int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount, OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_bruck rank %d", rank)); - err = ompi_datatype_get_extent (sdtype, &slb, &sext); + err = ompi_datatype_type_extent (sdtype, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } - err = ompi_datatype_get_true_extent(sdtype, &tlb, &tsext); - if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } - - err = ompi_datatype_get_extent (rdtype, &rlb, &rext); + err = ompi_datatype_type_extent (rdtype, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } + span = opal_datatype_span(&sdtype->super, size * scount, &gap); displs = (int *) malloc(size * sizeof(int)); if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; } @@ -229,9 +227,9 @@ int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount, if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; } /* tmp buffer allocation for message data */ - 
tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext); + tmpbuf_free = (char *)malloc(span); if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; } - tmpbuf = tmpbuf_free - slb; + tmpbuf = tmpbuf_free - gap; /* Step 1 - local rotation - shift up by rank */ err = ompi_datatype_copy_content_same_ddt (sdtype, diff --git a/ompi/mca/coll/base/coll_base_alltoallv.c b/ompi/mca/coll/base/coll_base_alltoallv.c index 7b395a551d..97283d48ac 100644 --- a/ompi/mca/coll/base/coll_base_alltoallv.c +++ b/ompi/mca/coll/base/coll_base_alltoallv.c @@ -38,16 +38,16 @@ int mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts, const int *rdisps, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; MPI_Request *preq; char *tmp_buffer; size_t max_size, rdtype_size; - ptrdiff_t ext; + OPAL_PTRDIFF_TYPE ext, gap; /* Initialize. */ @@ -63,16 +63,17 @@ mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); for (i = 0, max_size = 0 ; i < size ; ++i) { - size_t size = ext * rcounts[i]; - + size_t size = opal_datatype_span(&rdtype->super, rcounts[i], &gap); max_size = size > max_size ? 
size : max_size; } + /* The gap will always be the same as we are working on the same datatype */ /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmp_buffer += gap; /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c index bd2004f80f..80f7b70156 100644 --- a/ompi/mca/coll/base/coll_base_gather.c +++ b/ompi/mca/coll/base/coll_base_gather.c @@ -49,8 +49,8 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, char *ptmp = NULL, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; - MPI_Aint sextent, slb, strue_lb, strue_extent; - MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; + MPI_Aint sextent, sgap, ssize; + MPI_Aint rextent, rgap, rsize; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; @@ -64,14 +64,14 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; - ompi_datatype_get_extent(sdtype, &slb, &sextent); - ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent); + ompi_datatype_type_extent(sdtype, &sextent); + ompi_datatype_type_extent(rdtype, &rextent); + ssize = opal_datatype_span(&sdtype->super, scount * size, &sgap); + rsize = opal_datatype_span(&rdtype->super, rcount * size, &rgap); vrank = (rank - root + size) % size; if (rank == root) { - ompi_datatype_get_extent(rdtype, &rlb, &rextent); - ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); if (0 == root){ /* root on 0, just use the recv buffer */ ptmp = (char *) rbuf; @@ -83,12 +83,12 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ - tempbuf = (char *) 
malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); + tempbuf = (char *) malloc(rsize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - ptmp = tempbuf - rtrue_lb; + ptmp = tempbuf - rgap; if (sbuf != MPI_IN_PLACE) { /* copy from sbuf to temp buffer */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, @@ -106,12 +106,12 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, /* other non-leaf nodes, allocate temp buffer for data received from * children, the most we need is half of the total data elements due * to the property of binimoal tree */ - tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); + tempbuf = (char *) malloc(ssize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - ptmp = tempbuf - strue_lb; + ptmp = tempbuf - sgap; /* local copy to tempbuf */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, scount, sdtype); diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c index b6ea005f3f..34e7fa0e47 100644 --- a/ompi/mca/coll/base/coll_base_reduce.c +++ b/ompi/mca/coll/base/coll_base_reduce.c @@ -55,18 +55,16 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; - ptrdiff_t extent, lower_bound, segment_increment; + ptrdiff_t extent, size, gap, segment_increment; ompi_request_t **sreq = NULL, *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; int recvcount, prevcount, inbi; - size_t typelng; /** * Determine number of segments and number of elements * sent per operation */ - ompi_datatype_get_extent( datatype, &lower_bound, &extent ); - ompi_datatype_type_size( datatype, &typelng ); + ompi_datatype_type_extent( 
datatype, &extent ); num_segments = (original_count + count_by_segment - 1) / count_by_segment; segment_increment = (ptrdiff_t)count_by_segment * extent; @@ -84,21 +82,19 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { - ptrdiff_t true_lower_bound, true_extent, real_segment_size; - ompi_datatype_get_true_extent( datatype, &true_lower_bound, - &true_extent ); + ptrdiff_t real_segment_size; /* handle non existant recv buffer (i.e. its NULL) and protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. */ - accumbuf_free = (char*)malloc(true_extent + - (ptrdiff_t)(original_count - 1) * extent); + size = opal_datatype_span(&datatype->super, original_count, &gap); + accumbuf_free = (char*)malloc(size); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } - accumbuf = accumbuf_free - lower_bound; + accumbuf = accumbuf_free - gap; } /* If this is a non-commutative operation we must copy @@ -109,12 +105,12 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ - real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent; + real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap); inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } - inbuf[0] = inbuf_free[0] - lower_bound; + inbuf[0] = inbuf_free[0] - gap; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { @@ -122,7 +118,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; 
goto error_hndl; } - inbuf[1] = inbuf_free[1] - lower_bound; + inbuf[1] = inbuf_free[1] - gap; } /* reset input buffer index and receive count */ @@ -517,14 +513,13 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv use_this_sendbuf = (void *)sendbuf; use_this_recvbuf = recvbuf; if (io_root != root) { - ptrdiff_t tlb, text, lb, ext; + ptrdiff_t dsize, gap; char *tmpbuf = NULL; - ompi_datatype_get_extent(datatype, &lb, &ext); - ompi_datatype_get_true_extent(datatype, &tlb, &text); + dsize = opal_datatype_span(&datatype->super, count, &gap); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { - tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); + tmpbuf = (char *) malloc(dsize); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } @@ -533,7 +528,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { - tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); + tmpbuf = (char *) malloc(dsize); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } @@ -585,8 +580,6 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv * GEF Oct05 after asking Jeff. */ -/* copied function (with appropriate renaming) starts here */ - /* * reduce_lin_intra * @@ -603,7 +596,7 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count mca_coll_base_module_t *module) { int i, rank, err, size; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t extent, dsize, gap; char *free_buffer = NULL; char *pml_buffer = NULL; char *inplace_temp = NULL; @@ -623,151 +616,27 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count return err; } - /* Root receives and reduces messages. Allocate buffer to receive - * messages. This comment applies to all collectives in this basic - * module where we allocate a temporary buffer. 
For the next few - * lines of code, it's tremendously complicated how we decided that - * this was the Right Thing to do. Sit back and enjoy. And prepare - * to have your mind warped. :-) - * - * Recall some definitions (I always get these backwards, so I'm - * going to put them here): - * - * extent: the length from the lower bound to the upper bound -- may - * be considerably larger than the buffer required to hold the data - * (or smaller! But it's easiest to think about when it's larger). - * - * true extent: the exact number of bytes required to hold the data - * in the layout pattern in the datatype. - * - * For example, consider the following buffer (just talking about - * true_lb, extent, and true extent -- extrapolate for true_ub: - * - * A B C - * -------------------------------------------------------- - * | | | - * -------------------------------------------------------- - * - * There are multiple cases: - * - * 1. A is what we give to MPI_Send (and friends), and A is where - * the data starts, and C is where the data ends. In this case: - * - * - extent: C-A - * - true extent: C-A - * - true_lb: 0 - * - * A C - * -------------------------------------------------------- - * | | - * -------------------------------------------------------- - * <=======================extent=========================> - * <======================true extent=====================> - * - * 2. A is what we give to MPI_Send (and friends), B is where the - * data starts, and C is where the data ends. In this case: - * - * - extent: C-A - * - true extent: C-B - * - true_lb: positive - * - * A B C - * -------------------------------------------------------- - * | | User buffer | - * -------------------------------------------------------- - * <=======================extent=========================> - * <===============true extent=============> - * - * 3. B is what we give to MPI_Send (and friends), A is where the - * data starts, and C is where the data ends. 
In this case: - * - * - extent: C-A - * - true extent: C-A - * - true_lb: negative - * - * A B C - * -------------------------------------------------------- - * | | User buffer | - * -------------------------------------------------------- - * <=======================extent=========================> - * <======================true extent=====================> - * - * 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is - * where the data starts, and C is where the data ends. In this - * case: - * - * - extent: C-MPI_BOTTOM - * - true extent: C-B - * - true_lb: [potentially very large] positive - * - * MPI_BOTTOM B C - * -------------------------------------------------------- - * | | User buffer | - * -------------------------------------------------------- - * <=======================extent=========================> - * <===============true extent=============> - * - * So in all cases, for a temporary buffer, all we need to malloc() - * is a buffer of size true_extent. We therefore need to know two - * pointer values: what value to give to MPI_Send (and friends) and - * what value to give to free(), because they might not be the same. - * - * Clearly, what we give to free() is exactly what was returned from - * malloc(). That part is easy. :-) - * - * What we give to MPI_Send (and friends) is a bit more complicated. - * Let's take the 4 cases from above: - * - * 1. If A is what we give to MPI_Send and A is where the data - * starts, then clearly we give to MPI_Send what we got back from - * malloc(). - * - * 2. If B is what we get back from malloc, but we give A to - * MPI_Send, then the buffer range [A,B) represents "dead space" - * -- no data will be put there. So it's safe to give B-true_lb to - * MPI_Send. More specifically, the true_lb is positive, so B-true_lb is - * actually A. - * - * 3. If A is what we get back from malloc, and B is what we give to - * MPI_Send, then the true_lb is negative, so A-true_lb will actually equal - * B. - * - * 4. 
Although this seems like the weirdest case, it's actually - * quite similar to case #2 -- the pointer we give to MPI_Send is - * smaller than the pointer we got back from malloc(). - * - * Hence, in all cases, we give (return_from_malloc - true_lb) to MPI_Send. - * - * This works fine and dandy if we only have (count==1), which we - * rarely do. ;-) So we really need to allocate (true_extent + - * ((count - 1) * extent)) to get enough space for the rest. This may - * be more than is necessary, but it's ok. - * - * Simple, no? :-) - * - */ - - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); + ompi_datatype_type_extent(dtype, &extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; - inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); + inplace_temp = (char*)malloc(dsize); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } - rbuf = inplace_temp - true_lb; + rbuf = inplace_temp - gap; } if (size > 1) { - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { if (NULL != inplace_temp) { free(inplace_temp); } return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - true_lb; + pml_buffer = free_buffer - gap; } /* Initialize the receive buffer. 
*/ @@ -823,4 +692,3 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count return MPI_SUCCESS; } -/* copied function (with appropriate renaming) ends here */ diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c index a9e674ca9c..d4c88e25f8 100644 --- a/ompi/mca/coll/base/coll_base_reduce_scatter.c +++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c @@ -76,13 +76,11 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r if (root == rank) { /* We must allocate temporary receive buffer on root to ensure that rbuf is big enough */ - ptrdiff_t lb, extent, tlb, textent; + ptrdiff_t dsize, gap; + dsize = opal_datatype_span(&dtype->super, total_count, &gap); - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &tlb, &textent); - - tmprbuf_free = (char*) malloc(textent + (ptrdiff_t)(total_count - 1) * extent); - tmprbuf = tmprbuf_free - lb; + tmprbuf_free = (char*) malloc(dsize); + tmprbuf = tmprbuf_free - gap; } err = comm->c_coll.coll_reduce (sbuf, tmprbuf, total_count, dtype, op, root, comm, comm->c_coll.coll_reduce_module); @@ -134,7 +132,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, { int i, rank, size, count, err = OMPI_SUCCESS; int tmp_size, remain = 0, tmp_rank, *disps = NULL; - ptrdiff_t true_lb, true_extent, lb, extent, buf_size; + ptrdiff_t extent, buf_size, gap; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; @@ -161,9 +159,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, } /* get datatype information */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - buf_size = true_extent + (ptrdiff_t)(count - 1) * extent; + ompi_datatype_type_extent(dtype, &extent); + buf_size = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE 
*/ if (MPI_IN_PLACE == sbuf) { @@ -172,7 +169,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, /* Allocate temporary receive buffer. */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - true_lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; @@ -180,7 +177,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf, /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); - result_buf = result_buf_free - true_lb; + result_buf = result_buf_free - gap; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); @@ -459,9 +456,8 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in int inbi, *displs = NULL; char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL; char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL}; - ptrdiff_t true_lb, true_extent, lb, extent, max_real_segsize; + ptrdiff_t extent, max_real_segsize, dsize, gap; ompi_request_t *reqs[2] = {NULL, NULL}; - size_t typelng; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -500,26 +496,23 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in rbuf can be of rcounts[rank] size. - up to two temporary buffers used for communication/computation overlap. 
*/ - ret = ompi_datatype_get_extent(dtype, &lb, &extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - ret = ompi_datatype_type_size( dtype, &typelng); + ret = ompi_datatype_type_extent(dtype, &extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } - max_real_segsize = true_extent + (ptrdiff_t)(max_block_count - 1) * extent; + max_real_segsize = opal_datatype_span(&dtype->super, max_block_count, &gap); + dsize = opal_datatype_span(&dtype->super, total_count, &gap); - accumbuf_free = (char*)malloc(true_extent + (ptrdiff_t)(total_count - 1) * extent); + accumbuf_free = (char*)malloc(dsize); if (NULL == accumbuf_free) { ret = -1; line = __LINE__; goto error_hndl; } - accumbuf = accumbuf_free - lb; + accumbuf = accumbuf_free - gap; inbuf_free[0] = (char*)malloc(max_real_segsize); if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; } - inbuf[0] = inbuf_free[0] - lb; + inbuf[0] = inbuf_free[0] - gap; if (size > 2) { inbuf_free[1] = (char*)malloc(max_real_segsize); if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; } - inbuf[1] = inbuf_free[1] - lb; + inbuf[1] = inbuf_free[1] - gap; } /* Handle MPI_IN_PLACE for size > 1 */ diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c index b8b69aef9b..71605595e1 100644 --- a/ompi/mca/coll/base/coll_base_scatter.c +++ b/ompi/mca/coll/base/coll_base_scatter.c @@ -47,10 +47,10 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, int scount, char *ptmp, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; - MPI_Aint sextent, slb, strue_lb, strue_extent; - MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; + ptrdiff_t sextent, rextent, ssize, rsize, 
sgap, rgap; + size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); @@ -62,10 +62,11 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, int scount, COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; - ompi_datatype_get_extent(sdtype, &slb, &sextent); - ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent); - ompi_datatype_get_extent(rdtype, &rlb, &rextent); - ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); + ompi_datatype_type_extent(sdtype, &sextent); + ompi_datatype_type_extent(rdtype, &rextent); + + ssize = opal_datatype_span(&sdtype->super, scount * size, &sgap); + rsize = opal_datatype_span(&rdtype->super, rcount * size, &rgap); vrank = (rank - root + size) % size; ptmp = (char *) rbuf; /* by default suppose leaf nodes, just use rbuf */ @@ -82,12 +83,11 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, int scount, } } else { /* root is not on 0, allocate temp buffer for send */ - tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); + tempbuf = (char *) malloc(ssize); if (NULL == tempbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - - ptmp = tempbuf - strue_lb; + ptmp = tempbuf - sgap; /* and rotate data so they will eventually in the right place */ err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root), @@ -110,12 +110,11 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, int scount, } else if (!(vrank % 2)) { /* non-root, non-leaf nodes, allocte temp buffer for recv * the most we need is rcount*size/2 */ - tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); + tempbuf = (char *) malloc(rsize); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } - - ptmp = tempbuf - rtrue_lb; + ptmp = tempbuf - rgap; sdtype = rdtype; scount = rcount; @@ -204,7 +203,7 @@ 
ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount, mca_coll_base_module_t *module) { int i, rank, size, err; - ptrdiff_t lb, incr; + ptrdiff_t incr; char *ptmp; /* Initialize */ @@ -223,7 +222,7 @@ ompi_coll_base_scatter_intra_basic_linear(const void *sbuf, int scount, /* I am the root, loop sending data. */ - err = ompi_datatype_get_extent(sdtype, &lb, &incr); + err = ompi_datatype_type_extent(sdtype, &incr); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } diff --git a/ompi/mca/coll/basic/coll_basic_allreduce.c b/ompi/mca/coll/basic/coll_basic_allreduce.c index 68fe0ec4b9..c436556b19 100644 --- a/ompi/mca/coll/basic/coll_basic_allreduce.c +++ b/ompi/mca/coll/basic/coll_basic_allreduce.c @@ -81,8 +81,7 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int err, i, rank, root = 0, rsize, line; - ptrdiff_t lb, extent; - ptrdiff_t true_lb, true_extent; + ptrdiff_t extent, dsize, gap; char *tmpbuf = NULL, *pml_buffer = NULL; ompi_request_t *req[2]; ompi_request_t **reqs = NULL; @@ -100,18 +99,14 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count, * simultaniously. 
*/ /*****************************************************************/ if (rank == root) { - err = ompi_datatype_get_extent(dtype, &lb, &extent); + err = ompi_datatype_type_extent(dtype, &extent); if (OMPI_SUCCESS != err) { return OMPI_ERROR; } - err = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - if (OMPI_SUCCESS != err) { - return OMPI_ERROR; - } - - tmpbuf = (char *) malloc(true_extent + (count - 1) * extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); + tmpbuf = (char *) malloc(dsize); if (NULL == tmpbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; } - pml_buffer = tmpbuf - true_lb; + pml_buffer = tmpbuf - gap; reqs = coll_base_comm_get_reqs(module->base_data, rsize - 1); if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; } diff --git a/ompi/mca/coll/basic/coll_basic_alltoallw.c b/ompi/mca/coll/basic/coll_basic_alltoallw.c index 4b123151e3..cd6e4f7313 100644 --- a/ompi/mca/coll/basic/coll_basic_alltoallw.c +++ b/ompi/mca/coll/basic/coll_basic_alltoallw.c @@ -41,10 +41,10 @@ mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, con struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int i, j, size, rank, err=MPI_SUCCESS, max_size; + int i, j, size, rank, err = MPI_SUCCESS, max_size; MPI_Request *preq, *reqs = NULL; - char *tmp_buffer; - ptrdiff_t ext; + char *tmp_buffer, *save_buffer = NULL; + ptrdiff_t ext, gap; /* Initialize. */ @@ -58,17 +58,17 @@ mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, con /* Find the largest receive amount */ for (i = 0, max_size = 0 ; i < size ; ++i) { - ompi_datatype_type_extent (rdtypes[i], &ext); - ext *= rcounts[i]; + ext = opal_datatype_span(&rdtypes[i]->super, rcounts[i], &gap); max_size = ext > max_size ? 
ext : max_size; } /* Allocate a temporary buffer */ - tmp_buffer = calloc (max_size, 1); + tmp_buffer = save_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } + tmp_buffer -= gap; reqs = coll_base_comm_get_reqs( module->base_data, 2); /* in-place alltoallw slow algorithm (but works) */ @@ -126,7 +126,7 @@ mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, con error_hndl: /* Free the temporary buffer */ - free (tmp_buffer); + free (save_buffer); if( MPI_SUCCESS != err ) { /* Free the requests. */ if( NULL != reqs ) { ompi_coll_base_free_reqs(reqs, 2); diff --git a/ompi/mca/coll/basic/coll_basic_exscan.c b/ompi/mca/coll/basic/coll_basic_exscan.c index 34ec43fadc..057bcfa48c 100644 --- a/ompi/mca/coll/basic/coll_basic_exscan.c +++ b/ompi/mca/coll/basic/coll_basic_exscan.c @@ -49,7 +49,7 @@ mca_coll_basic_exscan_intra(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int size, rank, err; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t dsize, gap; char *free_buffer = NULL; char *reduce_buffer = NULL; @@ -83,14 +83,13 @@ mca_coll_basic_exscan_intra(const void *sbuf, void *rbuf, int count, /* Get a temporary buffer to perform the reduction into. Rationale * for malloc'ing this size is provided in coll_basic_reduce.c. 
*/ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - reduce_buffer = free_buffer - true_lb; + reduce_buffer = free_buffer - gap; err = ompi_datatype_copy_content_same_ddt(dtype, count, reduce_buffer, (char*)sbuf); diff --git a/ompi/mca/coll/basic/coll_basic_reduce.c b/ompi/mca/coll/basic/coll_basic_reduce.c index 20650d0e2e..ad2fd1e6f3 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce.c +++ b/ompi/mca/coll/basic/coll_basic_reduce.c @@ -92,7 +92,7 @@ mca_coll_basic_reduce_log_intra(const void *sbuf, void *rbuf, int count, { int i, size, rank, vrank; int err, peer, dim, mask; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t lb, extent, dsize, gap; char *free_buffer = NULL; char *free_rbuf = NULL; char *pml_buffer = NULL; @@ -120,14 +120,14 @@ mca_coll_basic_reduce_log_intra(const void *sbuf, void *rbuf, int count, * rationale above. */ ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - true_lb; + pml_buffer = free_buffer - gap; /* read the comment about commutative operations (few lines down * the page) */ if (ompi_op_is_commute(op)) { @@ -138,12 +138,12 @@ mca_coll_basic_reduce_log_intra(const void *sbuf, void *rbuf, int count, * rationale above. 
*/ if (MPI_IN_PLACE == sbuf) { - inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); + inplace_temp = (char*)malloc(dsize); if (NULL == inplace_temp) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } - sbuf = inplace_temp - true_lb; + sbuf = inplace_temp - gap; err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, (char*)rbuf); } snd_buffer = (char*)sbuf; @@ -152,12 +152,12 @@ mca_coll_basic_reduce_log_intra(const void *sbuf, void *rbuf, int count, /* root is the only one required to provide a valid rbuf. * Assume rbuf is invalid for all other ranks, so fix it up * here to be valid on all non-leaf ranks */ - free_rbuf = (char*)malloc(true_extent + (count - 1) * extent); + free_rbuf = (char*)malloc(dsize); if (NULL == free_rbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } - rbuf = free_rbuf - true_lb; + rbuf = free_rbuf - gap; } /* Loop over cube dimensions. High processes send to low ones in the @@ -288,7 +288,7 @@ mca_coll_basic_reduce_lin_inter(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int i, err, size; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t dsize, gap; char *free_buffer = NULL; char *pml_buffer = NULL; @@ -305,14 +305,13 @@ mca_coll_basic_reduce_lin_inter(const void *sbuf, void *rbuf, int count, MCA_PML_BASE_SEND_STANDARD, comm)); } else { /* Root receives and reduces messages */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - true_lb; + pml_buffer = free_buffer - gap; /* Initialize the receive buffer. 
*/ diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c index 43be9ba6ea..d8e9cc8a0d 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter.c @@ -71,7 +71,7 @@ mca_coll_basic_reduce_scatter_intra(const void *sbuf, void *rbuf, const int *rco mca_coll_base_module_t *module) { int i, rank, size, count, err = OMPI_SUCCESS; - ptrdiff_t true_lb, true_extent, lb, extent, buf_size; + ptrdiff_t extent, buf_size, gap; int *disps = NULL; char *recv_buf = NULL, *recv_buf_free = NULL; char *result_buf = NULL, *result_buf_free = NULL; @@ -96,9 +96,8 @@ mca_coll_basic_reduce_scatter_intra(const void *sbuf, void *rbuf, const int *rco } /* get datatype information */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - buf_size = true_extent + (count - 1) * extent; + ompi_datatype_type_extent(dtype, &extent); + buf_size = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { @@ -111,7 +110,7 @@ mca_coll_basic_reduce_scatter_intra(const void *sbuf, void *rbuf, const int *rco /* temporary receive buffer. See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - true_lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; @@ -119,7 +118,7 @@ mca_coll_basic_reduce_scatter_intra(const void *sbuf, void *rbuf, const int *rco /* allocate temporary buffer for results */ result_buf_free = (char*) malloc(buf_size); - result_buf = result_buf_free - true_lb; + result_buf = result_buf_free - gap; /* copy local buffer into the temporary results */ err = ompi_datatype_sndrcv(sbuf, count, dtype, result_buf, count, dtype); @@ -323,7 +322,7 @@ mca_coll_basic_reduce_scatter_intra(const void *sbuf, void *rbuf, const int *rco /* temporary receive buffer. 
See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - true_lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c index 9d2b3a4d5d..fca39e5d51 100644 --- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c +++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c @@ -58,7 +58,7 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, int rcou mca_coll_base_module_t *module) { int rank, size, count, err = OMPI_SUCCESS; - ptrdiff_t true_lb, true_extent, lb, extent, buf_size; + ptrdiff_t extent, buf_size, gap; char *recv_buf = NULL, *recv_buf_free = NULL; /* Initialize */ @@ -72,9 +72,8 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, int rcou } /* get datatype information */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - buf_size = true_extent + (count - 1) * extent; + ompi_datatype_type_extent(dtype, &extent); + buf_size = opal_datatype_span(&dtype->super, count, &gap); /* Handle MPI_IN_PLACE */ if (MPI_IN_PLACE == sbuf) { @@ -85,7 +84,7 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, int rcou /* temporary receive buffer. 
See coll_basic_reduce.c for details on sizing */ recv_buf_free = (char*) malloc(buf_size); - recv_buf = recv_buf_free - true_lb; + recv_buf = recv_buf_free - gap; if (NULL == recv_buf_free) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup; diff --git a/ompi/mca/coll/basic/coll_basic_scan.c b/ompi/mca/coll/basic/coll_basic_scan.c index e74cc3fac8..17ac9517e3 100644 --- a/ompi/mca/coll/basic/coll_basic_scan.c +++ b/ompi/mca/coll/basic/coll_basic_scan.c @@ -47,7 +47,7 @@ mca_coll_basic_scan_intra(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int size, rank, err; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t dsize, gap; char *free_buffer = NULL; char *pml_buffer = NULL; @@ -74,14 +74,12 @@ mca_coll_basic_scan_intra(const void *sbuf, void *rbuf, int count, * listed in coll_basic_reduce.c. Use this temporary buffer to * receive into, later. */ - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - - free_buffer = (char*)malloc(true_extent + (count - 1) * extent); + dsize = opal_datatype_span(&dtype->super, count, &gap); + free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - pml_buffer = free_buffer - true_lb; + pml_buffer = free_buffer - gap; /* Copy the send buffer into the receive buffer. */ diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c index 05e2c3910a..1606bcdf92 100644 --- a/ompi/mca/coll/cuda/coll_cuda_allreduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_allreduce.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. 
@@ -34,15 +34,14 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -50,7 +49,7 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -61,7 +60,7 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_allreduce(sbuf, rbuf, count, dtype, op, comm, s->c_coll.coll_allreduce_module); if (NULL != sbuf1) { diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c index 1f93722d62..bc336341ac 100644 --- a/ompi/mca/coll/cuda/coll_cuda_exscan.c +++ b/ompi/mca/coll/cuda/coll_cuda_exscan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. 
@@ -26,15 +26,14 @@ int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -42,7 +41,7 @@ int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -53,7 +52,7 @@ int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_exscan(sbuf, rbuf, count, dtype, op, comm, diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c index 1cd667f30b..2bcce13c75 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. 
@@ -34,15 +34,15 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -50,7 +50,7 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -61,7 +61,7 @@ mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_reduce((void *) sbuf, rbuf, count, dtype, op, root, comm, diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c index dab1d86ce4..0dccbc580f 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. 
@@ -38,16 +38,16 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t sbufsize, rbufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - sbufsize = (true_extent + (ptrdiff_t)(rcount - 1) * extent) * ompi_comm_size(comm); - rbufsize = true_extent + (ptrdiff_t)(rcount - 1) * extent; + rbufsize = opal_datatype_span(&dtype->super, rcount, &gap); + + sbufsize = rbufsize * ompi_comm_size(comm); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(sbufsize); if (NULL == sbuf1) { @@ -55,7 +55,7 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount, } opal_cuda_memcpy_sync(sbuf1, sbuf, sbufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -66,7 +66,7 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount, } opal_cuda_memcpy_sync(rbuf1, rbuf, rbufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_reduce_scatter_block(sbuf, rbuf, rcount, dtype, op, comm, s->c_coll.coll_reduce_scatter_block_module); diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c index 6cfb06a391..e9afde8107 100644 --- a/ompi/mca/coll/cuda/coll_cuda_scan.c +++ b/ompi/mca/coll/cuda/coll_cuda_scan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The University of Tennessee and The University + * Copyright (c) 2014-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved. 
@@ -33,15 +33,14 @@ int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t gap; char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL; const char *sbuf2; size_t bufsize; int rc; - ompi_datatype_get_extent(dtype, &lb, &extent); - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); - bufsize = true_extent + (ptrdiff_t)(count - 1) * extent; + bufsize = opal_datatype_span(&dtype->super, count, &gap); + if ((MPI_IN_PLACE != sbuf) && (opal_cuda_check_bufs((char *)sbuf, NULL))) { sbuf1 = (char*)malloc(bufsize); if (NULL == sbuf1) { @@ -49,7 +48,7 @@ int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(sbuf1, sbuf, bufsize); sbuf2 = sbuf; /* save away original buffer */ - sbuf = sbuf1 - true_lb; + sbuf = sbuf1 - gap; } if (opal_cuda_check_bufs(rbuf, NULL)) { @@ -60,7 +59,7 @@ int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count, } opal_cuda_memcpy_sync(rbuf1, rbuf, bufsize); rbuf2 = rbuf; /* save away original buffer */ - rbuf = rbuf1 - true_lb; + rbuf = rbuf1 - gap; } rc = s->c_coll.coll_scan(sbuf, rbuf, count, dtype, op, comm, s->c_coll.coll_scan_module); diff --git a/ompi/mca/coll/sm/coll_sm_reduce.c b/ompi/mca/coll/sm/coll_sm_reduce.c index d60f029b07..c731b87d2b 100644 --- a/ompi/mca/coll/sm/coll_sm_reduce.c +++ b/ompi/mca/coll/sm/coll_sm_reduce.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -187,9 +187,9 @@ static int reduce_inorder(const void *sbuf, void* rbuf, int count, size_t total_size, max_data, bytes; mca_coll_sm_in_use_flag_t *flag; mca_coll_sm_data_index_t *index; - size_t ddt_size; + size_t ddt_size, segsize; size_t segment_ddt_count, segment_ddt_bytes, zero = 0; - ptrdiff_t true_lb, true_extent, lb, extent; + ptrdiff_t extent, gap; /* Setup some identities */ @@ -205,10 +205,7 @@ static int reduce_inorder(const void *sbuf, void* rbuf, int count, /* ddt_size is the packed size (e.g., MPI_SHORT_INT is 6) */ ompi_datatype_type_size(dtype, &ddt_size); /* extent is from lb to ub (e.g., MPI_SHORT_INT is 8) */ - ompi_datatype_get_extent(dtype, &lb, &extent); - /* true_extent is extent of actual type map, ignoring lb and ub - (e.g., MPI_SHORT_INT is 8) */ - ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); + ompi_datatype_type_extent(dtype, &extent); segment_ddt_count = mca_coll_sm_component.sm_fragment_size / ddt_size; iov.iov_len = segment_ddt_bytes = segment_ddt_count * ddt_size; total_size = ddt_size * count; @@ -266,12 +263,13 @@ static int reduce_inorder(const void *sbuf, void* rbuf, int count, "segment_ddt_count" instances (i.e., the number of instances that can be held in a single fragment) */ - free_buffer = (char*)malloc(true_extent + - (segment_ddt_count - 1) * extent); + segsize = opal_datatype_span(&dtype->super, segment_ddt_count, &gap); + + free_buffer = (char*)malloc(segsize); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - reduce_temp_buffer = free_buffer - true_lb; + reduce_temp_buffer = free_buffer - gap; /* Trickery here: we use a potentially smaller count than the user count -- use the largest count that is <= @@ -312,15 +310,16 @@ static int reduce_inorder(const void *sbuf, void* rbuf, int count, as the sbuf */ if (MPI_IN_PLACE == sbuf && (size - 1) != rank) { - inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); 
+ segsize = opal_datatype_span(&dtype->super, count, &gap); + inplace_temp = (char*)malloc(segsize); if (NULL == inplace_temp) { if (NULL != free_buffer) { free(free_buffer); } return OMPI_ERR_OUT_OF_RESOURCE; } - sbuf = inplace_temp - true_lb; - ompi_datatype_copy_content_same_ddt(dtype, count, (char *) sbuf, (char *) rbuf); + sbuf = inplace_temp - gap; + ompi_datatype_copy_content_same_ddt(dtype, count, (char *)sbuf, (char *)rbuf); } else { inplace_temp = NULL; } diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index cf00a690c5..25f014ead0 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2010 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -329,6 +329,30 @@ OPAL_DECLSPEC opal_datatype_t* opal_datatype_create_from_packed_description( void** packed_buffer, struct opal_proc_t* remote_processor ); +/* Compute the span in memory of count datatypes. This function helps with temporary + * memory allocations for receiving already typed data (such as those used for reduce + * operations). This span is the distance between the minimum and the maximum byte + * in the memory layout of count datatypes, or in other terms the memory needed to + * allocate count times the datatype without the gap in the beginning and at the end. + * + * Returns: the memory span of count repetitions of the datatype, and in the gap + * argument, the number of bytes of the gap at the beginning. For a count that is + * not positive there is nothing to lay out, so both the span and the gap are 0. 
+ */ +static inline OPAL_PTRDIFF_TYPE +opal_datatype_span( const opal_datatype_t* pData, int64_t count, + OPAL_PTRDIFF_TYPE* gap) +{ + OPAL_PTRDIFF_TYPE extent = (pData->ub - pData->lb); + OPAL_PTRDIFF_TYPE true_extent = (pData->true_ub - pData->true_lb); + if( count <= 0 ) { /* nothing to lay out: avoid a negative or meaningless span */ + *gap = 0; + return 0; + } + *gap = pData->true_lb; + return true_extent + (count - 1) * extent; +} + #if OPAL_ENABLE_DEBUG /* * Set a breakpoint to this function in your favorite debugger