From f78d3d52cd32846fab0cceeb624a1f51caaa9fca Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 10 Jul 2019 11:30:59 -0400 Subject: [PATCH] Optimize the pack/unpack. Start optimizing the code. This commit divides the operations in 2 parts, the first, outside the critical part, deals with partial blocks of predefined elements, and the second, inside the critical path, only deals with full blocks of elements. This reduces the number of expensive operations in the critical path and results in a decent performance increase. Signed-off-by: George Bosilca --- opal/datatype/opal_datatype_pack.c | 30 ++++-- opal/datatype/opal_datatype_pack.h | 135 ++++++++++++++++---------- opal/datatype/opal_datatype_unpack.c | 82 +++++++++------- opal/datatype/opal_datatype_unpack.h | 140 +++++++++++++++++---------- 4 files changed, 238 insertions(+), 149 deletions(-) diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index cf69f6ada2..c0ab6df66d 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -272,18 +272,32 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ + + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) { + /* we have a partial (less than blocklen) basic datatype */ + int rc = PACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + if( 0 == rc ) /* not done */ + goto complete_loop; + if( 0 == count_desc ) { conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; } - goto complete_loop; + } + } + + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* we have a basic datatype (working on full blocks) */ + PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); + if( 0 != count_desc ) /* completed? */ + goto complete_loop; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d" diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index 4da9bd2450..1eaf2e8b9f 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -26,6 +26,63 @@ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif +/** + * This function deals only with partial elements. The COUNT points however to the whole leftover count, + * but this function is only expected to operate on an amount less than blength, that would allow the rest + * of the pack process to handle only entire blength blocks (plus the left over). + * + * Return 1 if we are now aligned on a block, 0 otherwise. + */ +static inline int +pack_partial_blocklen( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** memory, + unsigned char** packed, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now = *(COUNT); + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( *(COUNT) <= _elem->count * _elem->blocklen); + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + if( 0 == (do_now = (*COUNT) % _elem->blocklen) ) + return 1; + + size_t left_in_block = do_now; /* left in the current blocklen */ + + if( (do_now_bytes * do_now) > *(SPACE) ) + do_now = (*SPACE) / do_now_bytes; + + do_now_bytes *= do_now; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [partial]\n", + _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); + *(memory) += (ptrdiff_t)do_now_bytes; + if( do_now == left_in_block ) /* compensate if completed a blocklen */ + *(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + + *(COUNT) -= do_now; + *(SPACE) -= do_now_bytes; + *(packed) += do_now_bytes; + return (do_now == left_in_block); +} + +/** + * Pack entire blocks, plus a possible remainder if SPACE is constrained to less than COUNT elements. + */ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, @@ -36,27 +93,24 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, { const ddt_elem_desc_t* _elem = &((ELEM)->elem); size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t cando_count = *(COUNT), do_now, do_now_bytes; + size_t cando_count = *(COUNT), do_now_bytes; unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _packed = *packed; + assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */ assert( *(COUNT) <= _elem->count * _elem->blocklen); if( (blocklen_bytes * cando_count) > *(SPACE) ) cando_count = (*SPACE) / blocklen_bytes; - do_now = *(COUNT); /* save the COUNT for later */ /* premptively update the number of COUNT we will return. */ *(COUNT) -= cando_count; - if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */ - goto do_epilog; - } if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ for(; cando_count > 0; cando_count--) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n", (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); _packed += blocklen_bytes; @@ -65,61 +119,32 @@ pack_predefined_data( opal_convertor_t* CONVERTOR, goto update_and_return; } - blocklen_bytes *= _elem->blocklen; - if( (_elem->count * _elem->blocklen) == cando_count ) { - goto skip_prolog; - } - /** - * First check if we already did something on this element ? The COUNT is the number - * of remaining predefined types in the current elem, not how many predefined types - * should be manipulated in the current call (this number is instead reflected on the - * SPACE). - */ - do_now = do_now % _elem->blocklen; /* any partial elements ? */ + if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) { + blocklen_bytes *= _elem->blocklen; - if( 0 != do_now ) { - size_t left_in_block = do_now; /* left in the current blocklen */ - do_now = (do_now > cando_count ) ? cando_count : do_now; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); - _memory += (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - blocklen_bytes; - _packed += do_now_bytes; - cando_count -= do_now; - } - - skip_prolog: - /* Do as many full blocklen as possible */ - for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); - MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); - _packed += blocklen_bytes; - _memory += _elem->extent; - cando_count -= _elem->blocklen; + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); } /** * As an epilog do anything left from the last blocklen. */ if( 0 != cando_count ) { - - do_epilog: - assert( cando_count < _elem->blocklen ); + assert( (cando_count < _elem->blocklen) || + ((1 == _elem->count) && (cando_count <= _elem->blocklen)) ); do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", - (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) ); _memory += do_now_bytes; _packed += do_now_bytes; @@ -159,7 +184,15 @@ static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, *(COUNT) -= _copy_loops; } -#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ +#define PACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + MEMORY, /* the source pointer (char*) */ \ + PACKED, /* the destination pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +pack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) ) + +#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ ELEM, /* the basic element to be packed */ \ COUNT, /* the number of elements */ \ MEMORY, /* the source pointer (char*) */ \ diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index ac35a03c26..dca07796d9 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -282,6 +282,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; size_t missing_length = element_length - pConvertor->partial_length; @@ -302,34 +303,31 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, iov_len_local -= missing_length; pConvertor->partial_length = 0; /* nothing more inside */ } - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - iov_ptr, conv_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ + if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) { + /* we have a partial (less than blocklen) basic datatype */ + int rc = UNPACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + if( 0 == rc ) /* not done */ + goto complete_loop; + if( 0 == count_desc ) { conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; } - assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { - unsigned char* temp = conv_ptr; - /* We have some partial data here. Let's copy it into the convertor - * and keep it hot until the next round. - */ - assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); - COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); - - opal_unpack_partial_datatype( pConvertor, pElem, - iov_ptr, 0, iov_len_local, - &temp ); - - pConvertor->partial_length = iov_len_local; - iov_len_local = 0; - } - goto complete_loop; + } + } + + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* we have a basic datatype (working on full blocks) */ + UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + if( 0 != count_desc ) /* completed? */ + goto complete_loop; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n", @@ -337,11 +335,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( 0 == pConvertor->stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; + /* we're done. Force the exit of the main for loop (around iovec) */ + *out_size = iov_count; + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -380,14 +376,29 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; } } complete_loop: + assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + unsigned char* temp = conv_ptr; + /* We have some partial data here. Let's copy it into the convertor + * and keep it hot until the next round. + */ + assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); + COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); + + opal_unpack_partial_datatype( pConvertor, pElem, + iov_ptr, 0, iov_len_local, + &temp ); + + pConvertor->partial_length = iov_len_local; + iov_len_local = 0; + } + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_unpacked += iov[iov_count].iov_len; } - complete_conversion: *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; @@ -514,11 +525,9 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ if( 0 == pConvertor->stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; + /* we're done. Force the exit of the main for loop (around iovec) */ + *out_size = iov_count; + goto complete_loop; } pConvertor->stack_pos--; pStack--; @@ -552,7 +561,6 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_unpacked += iov[iov_count].iov_len; } - complete_conversion: *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h index 49a418ba2b..db5b58fd3c 100644 --- a/opal/datatype/opal_datatype_unpack.h +++ b/opal/datatype/opal_datatype_unpack.h @@ -26,6 +26,60 @@ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif +/** + * This function deals only with partial elements. The COUNT points however to the whole leftover count, + * but this function is only expected to operate on an amount less than blength, that would allow the rest + * of the pack process to handle only entire blength blocks (plus the left over). + * + * Return 1 if we are now aligned on a block, 0 otherwise. + */ +static inline int +unpack_partial_blocklen( opal_convertor_t* CONVERTOR, + const dt_elem_desc_t* ELEM, + size_t* COUNT, + unsigned char** packed, + unsigned char** memory, + size_t* SPACE ) +{ + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; + size_t do_now = (*COUNT); + unsigned char* _memory = (*memory) + _elem->disp; + unsigned char* _packed = *packed; + + assert( *(COUNT) <= (_elem->count * _elem->blocklen)); + + /** + * First check if we already did something on this element ? The COUNT is the number + * of remaining predefined types in the current elem, not how many predefined types + * should be manipulated in the current call (this number is instead reflected on the + * SPACE). + */ + if( 0 == (do_now = (*COUNT) % _elem->blocklen) ) + return 1; + + size_t left_in_block = do_now; /* left in the current blocklen */ + + if( (do_now_bytes * do_now) > *(SPACE) ) + do_now = (*SPACE) / do_now_bytes; + + do_now_bytes *= do_now; + + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [prolog]\n", + (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); + MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); + *(memory) += (ptrdiff_t)do_now_bytes; + if( do_now == left_in_block ) /* compensate if completed a blocklen */ + *(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); + + *(COUNT) -= do_now; + *(SPACE) -= do_now_bytes; + *(packed) += do_now_bytes; + return (do_now == left_in_block); +} + static inline void unpack_predefined_data( opal_convertor_t* CONVERTOR, const dt_elem_desc_t* ELEM, @@ -36,27 +90,24 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, { const ddt_elem_desc_t* _elem = &((ELEM)->elem); size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size; - size_t cando_count = (*COUNT), do_now, do_now_bytes; + size_t cando_count = (*COUNT), do_now_bytes; unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _packed = *packed; + assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */ assert( *(COUNT) <= (_elem->count * _elem->blocklen)); if( (blocklen_bytes * cando_count) > *(SPACE) ) cando_count = (*SPACE) / blocklen_bytes; - do_now = *(COUNT); /* save the COUNT for later */ /* premptively update the number of COUNT we will return. */ *(COUNT) -= cando_count; - - if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */ - goto do_epilog; - } + if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */ for(; cando_count > 0; cando_count--) { OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n", (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); _packed += blocklen_bytes; @@ -65,57 +116,27 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, goto update_and_return; } - blocklen_bytes *= _elem->blocklen; - if( (_elem->count * _elem->blocklen) == cando_count ) { - goto skip_prolog; - } + if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) { + blocklen_bytes *= _elem->blocklen; - /** - * First check if we already did something on this element ? The COUNT is the number - * of remaining predefined types in the current elem, not how many predefined types - * should be manipulated in the current call (this number is instead reflected on the - * SPACE). - */ - do_now = do_now % _elem->blocklen; /* any partial elements ? */ - - if( 0 != do_now ) { - size_t left_in_block = do_now; /* left in the current blocklen */ - do_now = (do_now > cando_count ) ? cando_count : do_now; - do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; - - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", - (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); - MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) ); - _memory += (ptrdiff_t)do_now_bytes; - /* compensate if we just completed a blocklen */ - if( do_now == left_in_block ) - _memory += _elem->extent - blocklen_bytes; - _packed += do_now_bytes; - cando_count -= do_now; - } - - skip_prolog: - /* Do as many full blocklen as possible */ - for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, - (CONVERTOR)->pDesc, (CONVERTOR)->count ); - DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", - (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); - MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); - _packed += blocklen_bytes; - _memory += _elem->extent; - cando_count -= _elem->blocklen; + do { /* Do as many full blocklen as possible */ + OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf, + (CONVERTOR)->pDesc, (CONVERTOR)->count ); + DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", + (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); ); + MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) ); + _packed += blocklen_bytes; + _memory += _elem->extent; + cando_count -= _elem->blocklen; + } while (_elem->blocklen <= cando_count); } /** * As an epilog do anything left from the last blocklen. */ if( 0 != cando_count ) { - - do_epilog: - assert( cando_count < _elem->blocklen ); + assert( (cando_count < _elem->blocklen) || + ((1 == _elem->count) && (cando_count <= _elem->blocklen)) ); do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size; OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); @@ -160,8 +181,21 @@ static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, *(COUNT) -= _copy_loops; } -#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ - unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) +#define UNPACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + PACKED, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +unpack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) + +#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ + ELEM, /* the basic element to be packed */ \ + COUNT, /* the number of elements */ \ + PACKED, /* the destination pointer (char*) */ \ + MEMORY, /* the source pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) ) #define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \ unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) )