From 012a00480616cfd30c91de50635c0718d5cde72d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 20 May 2019 11:43:29 -0400 Subject: [PATCH] Clean and sync the pack and unpack functions. - optimize handling of contiguous with gaps datatypes. - fixes a performance issue for all datatypes with a count of 1. - optimize the pack/unpack of contiguous with gaps datatype. - optimize the case of blocklen == 1 Signed-off-by: George Bosilca --- opal/datatype/opal_convertor_raw.c | 6 +- opal/datatype/opal_datatype_copy.h | 32 ++-- opal/datatype/opal_datatype_module.c | 1 - opal/datatype/opal_datatype_pack.c | 214 ++++++++++--------------- opal/datatype/opal_datatype_pack.h | 114 +++++++------ opal/datatype/opal_datatype_position.c | 60 ++++--- opal/datatype/opal_datatype_unpack.c | 124 +++++++------- opal/datatype/opal_datatype_unpack.h | 118 +++++++------- 8 files changed, 316 insertions(+), 353 deletions(-) diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index df2340122a..893792583f 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -31,8 +31,8 @@ #endif /* OPAL_ENABLE_DEBUG */ /* Take a new iovec (base + len) and try to merge it with what we already - * have. If we succeed return 0 and move forward, if not save it into a new - * iovec location. If we need to go to a new position and we reach the end + * have. If we succeed return 0 and move forward, otherwise save it into a new + * iovec location. If we need to advance position and we reach the end * of the iovec array, return 1 to signal we did not saved the last iovec. 
*/ static inline int @@ -46,7 +46,7 @@ opal_convertor_merge_iov( struct iovec* iov, uint32_t* iov_count, return 0; } /* cannot merge, move to the next position */ *idx = *idx + 1; - if( *idx == *iov_count ) return 1; /* do not overwrite outside the iove array boundaries */ + if( *idx == *iov_count ) return 1; /* do not overwrite outside the iovec array boundaries */ } iov[*idx].iov_base = base; iov[*idx].iov_len = len; diff --git a/opal/datatype/opal_datatype_copy.h b/opal/datatype/opal_datatype_copy.h index 40f119a684..11058012e1 100644 --- a/opal/datatype/opal_datatype_copy.h +++ b/opal/datatype/opal_datatype_copy.h @@ -51,11 +51,9 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM, const ddt_elem_desc_t* _elem = &((ELEM)->elem); unsigned char* _source = (SOURCE) + _elem->disp; unsigned char* _destination = (DESTINATION) + _elem->disp; - size_t total_count = _elem->count * _elem->blocklen; - size_t do_now, do_now_bytes; + size_t do_now = _elem->count, do_now_bytes; - assert( (COUNT) == total_count); - assert( total_count <= ((*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size) ); + assert( (COUNT) == (do_now * _elem->blocklen)); /* We don't a prologue and epilogue here as we are __always__ working * with full copies of the data description. @@ -64,21 +62,19 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM, /** * Compute how many full blocklen we need to do and do them. 
*/ - do_now = _elem->count; - if( 0 != do_now ) { - do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; - for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", - STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) ); ); - MEM_OP( _destination, _source, do_now_bytes ); - _destination += _elem->extent; - _source += _elem->extent; - *(SPACE) -= do_now_bytes; - } - (COUNT) -= total_count; + do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; + assert( (do_now * do_now_bytes) <= (*SPACE) ); + + for(size_t _i = 0; _i < do_now; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", + STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) - _i * do_now_bytes ); ); + MEM_OP( _destination, _source, do_now_bytes ); + _destination += _elem->extent; + _source += _elem->extent; } + *(SPACE) -= (do_now_bytes * do_now); } static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 7976392b63..d4415b21ef 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -224,7 +224,6 @@ int32_t opal_datatype_init( void ) OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS; datatype->desc.desc[0].elem.common.type = i; - /* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */ datatype->desc.desc[0].elem.count = 1; datatype->desc.desc[0].elem.blocklen = 1; datatype->desc.desc[0].elem.disp = 0; diff --git a/opal/datatype/opal_datatype_pack.c 
b/opal/datatype/opal_datatype_pack.c index 55889fcaa5..cf69f6ada2 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -53,8 +53,6 @@ #endif /* defined(CHECKSUM) */ -#define IOVEC_MEM_LIMIT 8192 - /* the contig versions does not use the stack. They can easily retrieve * the status with just the informations from pConvertor->bConverted. */ @@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv, unsigned char *source_base = NULL; uint32_t iov_count; size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; - source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp); + source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp); /* There are some optimizations that can be done if the upper level * does not provide a buffer. 
@@ -111,155 +108,116 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv, uint32_t* out_size, size_t* max_data ) { + size_t remaining, length, initial_bytes_converted = pConv->bConverted; const opal_datatype_t* pData = pConv->pDesc; dt_stack_t* stack = pConv->pStack; + ptrdiff_t extent = pData->ub - pData->lb; unsigned char *user_memory, *packed_buffer; - uint32_t iov_count, index; + uint32_t idx; size_t i; - size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; - ptrdiff_t extent= pData->ub - pData->lb; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; + /* The memory layout is contiguous with gaps at the beginning and at the end. The datatype true_lb + * is the initial displacement, the size is the length of the contiguous area and the extent represents + * how much we should jump between elements. + */ assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) ); DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", (void*)pConv->pBaseBuf, *out_size ); ); if( stack[1].type != opal_datatype_uint1.id ) { stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; - stack[1].type = opal_datatype_uint1.id; + stack[1].type = opal_datatype_uint1.id; + } + /* We can provide directly the pointers in the user buffers (like the convertor_raw) */ + if( NULL == iov[0].iov_base ) { + user_memory = pConv->pBaseBuf + pData->true_lb; + + for( idx = 0; (idx < (*out_size)) && stack[0].count; idx++ ) { + iov[idx].iov_base = user_memory + stack[0].disp + stack[1].disp; + iov[idx].iov_len = stack[1].count; + COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv ); + + pConv->bConverted += stack[1].count; + + stack[0].disp += extent; + stack[0].count--; + stack[1].disp = 0; + stack[1].count = pData->size; /* we might need this to update the partial + * length for the first iteration */ + } + goto 
update_status_and_return; } - /* There are some optimizations that can be done if the upper level - * does not provide a buffer. - */ - for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + for( idx = 0; idx < (*out_size); idx++ ) { /* Limit the amount of packed data to the data left over on this convertor */ remaining = pConv->local_size - pConv->bConverted; if( 0 == remaining ) break; /* we're done this time */ - if( remaining > iov[iov_count].iov_len ) - remaining = iov[iov_count].iov_len; - packed_buffer = (unsigned char *)iov[iov_count].iov_base; - bConverted = remaining; /* how much will get unpacked this time */ - user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp; - i = pConv->count - stack[0].count; /* how many we already packed */ - assert(i == (pConv->bConverted / pData->size)); + if( remaining > iov[idx].iov_len ) + remaining = iov[idx].iov_len; + packed_buffer = (unsigned char *)iov[idx].iov_base; + pConv->bConverted += remaining; + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; - if( packed_buffer == NULL ) { - /* special case for small data. We avoid allocating memory if we - * can fill the iovec directly with the address of the remaining - * data. 
- */ - if( stack->count < (size_t)((*out_size) - iov_count) ) { - stack[1].count = pData->size - (pConv->bConverted % pData->size); - for( index = iov_count; i < pConv->count; i++, index++ ) { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = stack[1].count; - stack[0].disp += extent; - pConv->bConverted += stack[1].count; - stack[1].disp = 0; /* reset it for the next round */ - stack[1].count = pData->size; - user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp; - COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv ); - } - *out_size = iov_count + index; - *max_data = (pConv->bConverted - initial_bytes_converted); - pConv->flags |= CONVERTOR_COMPLETED; - return 1; /* we're done */ - } - /* now special case for big contiguous data with gaps around */ - if( pData->size >= IOVEC_MEM_LIMIT ) { - /* as we dont have to copy any data, we can simply fill the iovecs - * with data from the user data description. - */ - for( index = iov_count; (i < pConv->count) && (index < (*out_size)); - i++, index++ ) { - if( remaining < pData->size ) { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = remaining; - remaining = 0; - COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv ); - break; - } else { - iov[index].iov_base = (IOVBASE_TYPE *) user_memory; - iov[index].iov_len = pData->size; - user_memory += extent; - COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv ); - } - remaining -= iov[index].iov_len; - pConv->bConverted += iov[index].iov_len; - } - *out_size = index; - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - return 0; + DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, remaining ); ); + + length = (0 == pConv->stack_pos ? 
0 : stack[1].count); /* left over from the last pack */ + /* data left from last round and enough space in the buffer */ + if( (pData->size != length) && (length <= remaining)) { + /* copy the partial left-over from the previous round */ + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n", + (void*)user_memory, (void*)packed_buffer, length ); ); + MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); + packed_buffer += length; + remaining -= length; + stack[1].count -= length; + stack[1].disp += length; /* just in case, we overwrite this below */ + if( 0 == stack[1].count) { /* one completed element */ + stack[0].count--; + stack[0].disp += extent; + if( 0 == stack[0].count ) /* all elements packed, nothing left to do */ + break; + stack[1].count = pData->size; + stack[1].disp = 0; } + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; } - { - DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + for( i = 0; pData->size <= remaining; i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); ); + MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); + packed_buffer += pData->size; + user_memory += extent; + remaining -= pData->size; + } + stack[0].count -= i; /* the entire datatype copied above */ + stack[0].disp += (i * extent); - length = (0 == pConv->stack_pos ? 
0 : stack[1].count); /* left over from the last pack */ - /* data left from last round and enough space in the buffer */ - if( (0 != length) && (length <= remaining)) { - /* copy the partial left-over from the previous round */ - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); - MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); - packed_buffer += length; - user_memory += (extent - pData->size + length); - remaining -= length; - stack[1].count -= length; - if( 0 == stack[1].count) { /* one completed element */ - stack[0].count--; - stack[0].disp += extent; - if( 0 != stack[0].count ) { /* not yet done */ - stack[1].count = pData->size; - stack[1].disp = 0; - } - } - } - for( i = 0; pData->size <= remaining; i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); - MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); - packed_buffer += pData->size; - user_memory += extent; - remaining -= pData->size; - } - stack[0].count -= i; /* the filled up and the entire types */ - stack[0].disp += (i * extent); - stack[1].disp += remaining; - /* Copy the last bits */ - if( 0 != remaining ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "4. 
pack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); - MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); - user_memory += remaining; - stack[1].count -= remaining; - } + /* Copy the last bits */ + if( 0 != remaining ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, + pData, pConv->count ); + DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n", + (void*)user_memory, (void*)packed_buffer, remaining ); ); + MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); + stack[1].count -= remaining; + stack[1].disp += remaining; /* keep the += in case we are copying less than the datatype size */ if( 0 == stack[1].count ) { /* prepare for the next element */ stack[1].count = pData->size; stack[1].disp = 0; } } - pConv->bConverted += bConverted; } - *out_size = iov_count; - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - return 0; + + update_status_and_return: + *out_size = idx; + *max_data = pConv->bConverted - initial_bytes_converted; + if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED; + return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */ } /* The pack/unpack functions need a cleanup. 
I have to create a proper interface to access diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index 66259f8b66..514f8bd7b0 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -35,82 +35,90 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, size_t* SPACE ) { const ddt_elem_desc_t* _elem = &((ELEM)->elem); - size_t total_count = _elem->count * _elem->blocklen; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; size_t do_now, do_now_bytes; + size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; assert( *(COUNT) <= _elem->count * _elem->blocklen); if( cando_count > *(COUNT) ) cando_count = *(COUNT); - /** - * First check if we already did something on this element ? - */ - do_now = (total_count - *(COUNT)); /* done elements */ - if( 0 != do_now ) { - do_now = do_now % _elem->blocklen; /* partial blocklen? */ - - if( 0 != do_now ) { - size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ - do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 1. 
memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); - _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; - cando_count -= do_now; + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ + *(COUNT) -= cando_count; + for(; cando_count > 0; cando_count--) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; } + goto update_and_return; + } + blocklen_bytes *= _elem->blocklen; + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */ + /* preemptively update the COUNT we will return. */ + *(COUNT) -= cando_count; + if( 0 != do_now ) { + size_t left_in_block = do_now; /* left in the current blocklen */ + do_now = (do_now > cando_count ) ? 
cando_count : do_now; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); + _memory += (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - blocklen_bytes; + _packed += do_now_bytes; + cando_count -= do_now; } - /** - * Compute how many full blocklen we need to do and do them. - */ - do_now = cando_count / _elem->blocklen; - if( 0 != do_now ) { - do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; - for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); - MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); - *(packed) += do_now_bytes; - _memory += _elem->extent; - *(SPACE) -= do_now_bytes; - *(COUNT) -= _elem->blocklen; - cando_count -= _elem->blocklen; - } + /* Do as many full blocklen as possible */ + for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. 
memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; } /** * As an epilog do anything left from the last blocklen. */ - do_now = cando_count; - if( 0 != do_now ) { - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + if( 0 != cando_count ) { + assert( cando_count < _elem->blocklen ); + do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); + (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", - (void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); + (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); _memory += do_now_bytes; - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; + _packed += do_now_bytes; } + update_and_return: *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, diff --git a/opal/datatype/opal_datatype_position.c b/opal/datatype/opal_datatype_position.c index 381a31086d..f8137c7e0c 100644 --- a/opal/datatype/opal_datatype_position.c +++ b/opal/datatype/opal_datatype_position.c @@ -49,10 +49,24 @@ * - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless. 
*/ +static inline void +position_single_block(opal_convertor_t* CONVERTOR, + unsigned char** mem, ptrdiff_t mem_update, + size_t* space, size_t space_update, + size_t* cnt, size_t cnt_update) +{ + OPAL_DATATYPE_SAFEGUARD_POINTER( *mem, mem_update, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n", + (void*)*mem, (unsigned long)space_update, (unsigned long)(*space) ); ); + *mem += mem_update; + *space -= space_update; + *cnt -= cnt_update; +} + /** - * Advance the current position in the convertor based using the - * current element and a left-over counter. Update the head pointer - * and the leftover byte space. + * Advance the convertor's position according to the current element, the remaining COUNT + * and the available SPACE. Update the pointer and the remaining space accordingly. */ static inline void position_predefined_data( opal_convertor_t* CONVERTOR, @@ -64,7 +78,7 @@ position_predefined_data( opal_convertor_t* CONVERTOR, const ddt_elem_desc_t* _elem = &((ELEM)->elem); size_t total_count = _elem->count * _elem->blocklen; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t do_now, do_now_bytes; + size_t do_now, do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; unsigned char* _memory = (*POINTER) + _elem->disp; assert( *(COUNT) <= _elem->count * _elem->blocklen); @@ -72,6 +86,15 @@ position_predefined_data( opal_convertor_t* CONVERTOR, if( cando_count > *(COUNT) ) cando_count = *(COUNT); + if( 1 == _elem->blocklen ) { + DO_DEBUG( opal_output( 0, "position( %p, %" PRIsize_t " ) x (count %" PRIsize_t ", extent %ld) => space %lu [prolog]\n", + (void*)_memory, (unsigned long)do_now_bytes, cando_count, _elem->extent, (unsigned long)(*SPACE) ); ); + _memory += cando_count * _elem->extent; + *SPACE -= cando_count * do_now_bytes; + *COUNT -= cando_count; + goto update_and_return; + } + /** * First check if we already did something on this element ? 
*/ @@ -84,16 +107,12 @@ position_predefined_data( opal_convertor_t* CONVERTOR, do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n", - (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - _memory = *(POINTER) + _elem->disp + (ptrdiff_t)do_now_bytes; + position_single_block( CONVERTOR, &_memory, do_now_bytes, + SPACE, do_now_bytes, COUNT, do_now ); + /* compensate if we just completed a blocklen */ if( do_now == left_in_block ) _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; cando_count -= do_now; } } @@ -105,13 +124,8 @@ position_predefined_data( opal_convertor_t* CONVERTOR, if( 0 != do_now ) { do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu\n", - (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); - _memory += _elem->extent; - *(SPACE) -= do_now_bytes; - *(COUNT) -= _elem->blocklen; + position_single_block( CONVERTOR, &_memory, _elem->extent, + SPACE, do_now_bytes, COUNT, _elem->blocklen ); cando_count -= _elem->blocklen; } } @@ -122,15 +136,11 @@ position_predefined_data( opal_convertor_t* CONVERTOR, do_now = cando_count; if( 0 != do_now ) { do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( 
opal_output( 0, "position( %p, %lu ) => space %lu [epilog]\n", - (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - _memory += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; + position_single_block( CONVERTOR, &_memory, do_now_bytes, + SPACE, do_now_bytes, COUNT, do_now ); } + update_and_return: *(POINTER) = _memory - _elem->disp; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 3edb916192..ac35a03c26 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2019 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -70,98 +70,82 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, { const opal_datatype_t *pData = pConv->pDesc; unsigned char *user_memory, *packed_buffer; - uint32_t iov_count, i; - size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; + uint32_t iov_idx, i; + size_t remaining, initial_bytes_converted = pConv->bConverted; dt_stack_t* stack = pConv->pStack; ptrdiff_t extent = pData->ub - pData->lb; - ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", + DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov count %d )\n", (void*)pConv->pBaseBuf, *out_size ); ); if( stack[1].type != opal_datatype_uint1.id ) { stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; stack[1].type = opal_datatype_uint1.id; } - for( iov_count = 0; iov_count < 
(*out_size); iov_count++ ) { - remaining = pConv->local_size - pConv->bConverted; - if( 0 == remaining ) break; /* we're done this time */ - if( remaining > iov[iov_count].iov_len ) - remaining = iov[iov_count].iov_len; - packed_buffer = (unsigned char*)iov[iov_count].iov_base; - bConverted = remaining; /* how much will get unpacked this time */ - user_memory = pConv->pBaseBuf + initial_displ; - if( (ptrdiff_t)pData->size == extent ) { - user_memory += pConv->bConverted; - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + if( (ptrdiff_t)pData->size == extent ) { + for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) { + remaining = pConv->local_size - pConv->bConverted; + if( 0 == remaining ) break; /* we're done this time */ + if( remaining > iov[iov_idx].iov_len ) + remaining = iov[iov_idx].iov_len; + + packed_buffer = (unsigned char*)iov[iov_idx].iov_base; + user_memory = pConv->pBaseBuf + pData->true_lb + pConv->bConverted; /* contiguous data or basic datatype with count */ OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "1. 
unpack contig dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + DO_DEBUG( opal_output( 0, "unpack contig [%d] dest %p src %p length %" PRIsize_t "\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); - } else { - user_memory += stack[0].disp + stack[1].disp; + pConv->bConverted += remaining; /* how much will get unpacked this time */ + } + } else { + for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) { + remaining = pConv->local_size - pConv->bConverted; + if( 0 == remaining ) break; /* we're done this time */ + if( remaining > iov[iov_idx].iov_len ) + remaining = iov[iov_idx].iov_len; - DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + packed_buffer = (unsigned char*)iov[iov_idx].iov_base; + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp; + pConv->bConverted += remaining; /* how much will get unpacked this time */ - length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last unpack */ - /* complete the last copy */ - if( (0 != length) && (length <= remaining) ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, + for( i = 0; stack[1].count <= remaining; i++ ) { /* partial or full data */ + OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, stack[1].count, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "2. 
unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); - MEMCPY_CSUM( user_memory, packed_buffer, length, pConv ); - packed_buffer += length; - user_memory += (extent - (pData->size - length)); - remaining -= length; - stack[1].count -= length; - if( 0 == stack[1].count) { /* one completed element */ - stack[0].count--; - stack[0].disp += extent; - if( 0 != stack[0].count ) { /* not yet done */ - stack[1].count = pData->size; - stack[1].disp = 0; - } - } + DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [%d]\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, stack[1].count, i ); ); + MEMCPY_CSUM( user_memory, packed_buffer, stack[1].count, pConv ); + + packed_buffer += stack[1].count; + remaining -= stack[1].count; + + stack[0].count--; + stack[0].disp += extent; + stack[1].count = pData->size; + stack[1].disp = 0; + + user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp; } - for( i = 0; pData->size <= remaining; i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, - pData, pConv->count ); - DO_DEBUG( opal_output( 0, "3. unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); - MEMCPY_CSUM( user_memory, packed_buffer, pData->size, pConv ); - packed_buffer += pData->size; - user_memory += extent; - remaining -= pData->size; - } - stack[0].count -= i; - stack[0].disp += (i * extent); - stack[1].disp += remaining; - /* copy the last bits */ + + /* Copy the last bits */ if( 0 != remaining ) { OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "4. 
unpack dest %p src %p length %lu\n", - (void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); + DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [epilog]\n", + iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); - user_memory += remaining; stack[1].count -= remaining; + stack[1].disp += remaining; /* keep the += in case we are copying less than the datatype size */ + assert( stack[1].count ); } } - pConv->bConverted += bConverted; } - *out_size = iov_count; /* we only reach this line after the for loop succesfully complete */ - *max_data = (pConv->bConverted - initial_bytes_converted); - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - return 0; + *out_size = iov_idx; /* we only reach this line after the for loop successfully completes */ + *max_data = pConv->bConverted - initial_bytes_converted; + if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED; + return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */ } /** @@ -179,7 +163,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv, static inline void opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem, unsigned char* partial_data, - ptrdiff_t start_position, ptrdiff_t length, + ptrdiff_t start_position, size_t length, unsigned char** user_buffer ) { char unused_byte = 0x7F, saved_data[16]; @@ -195,7 +179,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle /* Find a byte that is not used in the partial buffer */ find_unused_byte: - for(ptrdiff_t i = 0; i < length; i++ ) { + for(size_t i = 0; i < length; i++ ) { if( unused_byte == partial_data[i] ) { unused_byte--; goto find_unused_byte; @@ -306,7 +290,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); 
opal_unpack_partial_datatype( pConvertor, pElem, iov_ptr, - pConvertor->partial_length, element_length - pConvertor->partial_length, + pConvertor->partial_length, (size_t)(element_length - pConvertor->partial_length), &conv_ptr ); --count_desc; if( 0 == count_desc ) { diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index f51a609294..5a3679bc37 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -35,82 +35,90 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, size_t* SPACE ) { const ddt_elem_desc_t* _elem = &((ELEM)->elem); - size_t total_count = _elem->count * _elem->blocklen; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; size_t do_now, do_now_bytes; + size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; - assert( *(COUNT) <= _elem->count * _elem->blocklen); + assert( *(COUNT) <= (_elem->count * _elem->blocklen)); if( cando_count > *(COUNT) ) cando_count = *(COUNT); - /** - * First check if we already did something on this element ? - */ - do_now = (total_count - *(COUNT)); /* done elements */ - if( 0 != do_now ) { - do_now = do_now % _elem->blocklen; /* partial blocklen? */ - - if( 0 != do_now ) { - size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ - do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 1. 
memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); - _memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; - cando_count -= do_now; + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ + *(COUNT) -= cando_count; + for(; cando_count > 0; cando_count--) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; } + goto update_and_return; + } + blocklen_bytes *= _elem->blocklen; + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */ + /* preemptively update the number of COUNT we will return. */ + *(COUNT) -= cando_count; + if( 0 != do_now ) { + size_t left_in_block = do_now; /* left in the current blocklen */ + do_now = (do_now > cando_count ) ? 
cando_count : do_now; + do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); + _memory += (ptrdiff_t)do_now_bytes; + /* compensate if we just completed a blocklen */ + if( do_now == left_in_block ) + _memory += _elem->extent - blocklen_bytes; + _packed += do_now_bytes; + cando_count -= do_now; } - /** - * Compute how many full blocklen we need to do and do them. - */ - do_now = cando_count / _elem->blocklen; - if( 0 != do_now ) { - do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; - for(size_t _i = 0; _i < do_now; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); - MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); - *(packed) += do_now_bytes; - _memory += _elem->extent; - *(SPACE) -= do_now_bytes; - *(COUNT) -= _elem->blocklen; - cando_count -= _elem->blocklen; - } + /* Do as many full blocklen as possible */ + for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 2. 
memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; } /** * As an epilog do anything left from the last blocklen. */ - do_now = cando_count; - if( 0 != do_now ) { - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; + if( 0 != cando_count ) { + assert( cando_count < _elem->blocklen ); + do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", - (void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); _memory += do_now_bytes; - *(packed) += do_now_bytes; - *(SPACE) -= do_now_bytes; - *(COUNT) -= do_now; + _packed += do_now_bytes; } + update_and_return: *(memory) = _memory - _elem->disp; + *(SPACE) -= (_packed - *packed); + *(packed) = _packed; } static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,