1
1

Clean and sync the pack and unpack functions.

- optimize handling of contiguous with gaps datatypes.
- fixes a performance issue for all datatypes with a count of 1.
- optimize the pack/unpack of contiguous with gaps datatype.
- optimize the case of blocklen == 1

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
George Bosilca 2019-05-20 11:43:29 -04:00
родитель 0a00b02e48
Коммит 012a004806
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 09C926752C9F09B1
8 изменённых файлов: 316 добавлений и 353 удалений

Просмотреть файл

@ -31,8 +31,8 @@
#endif /* OPAL_ENABLE_DEBUG */ #endif /* OPAL_ENABLE_DEBUG */
/* Take a new iovec (base + len) and try to merge it with what we already /* Take a new iovec (base + len) and try to merge it with what we already
* have. If we succeed return 0 and move forward, if not save it into a new * have. If we succeed return 0 and move forward, otherwise save it into a new
* iovec location. If we need to go to a new position and we reach the end * iovec location. If we need to advance position and we reach the end
* of the iovec array, return 1 to signal we did not save the last iovec. * of the iovec array, return 1 to signal we did not save the last iovec.
*/ */
static inline int static inline int
@ -46,7 +46,7 @@ opal_convertor_merge_iov( struct iovec* iov, uint32_t* iov_count,
return 0; return 0;
} /* cannot merge, move to the next position */ } /* cannot merge, move to the next position */
*idx = *idx + 1; *idx = *idx + 1;
if( *idx == *iov_count ) return 1; /* do not overwrite outside the iove array boundaries */ if( *idx == *iov_count ) return 1; /* do not overwrite outside the iovec array boundaries */
} }
iov[*idx].iov_base = base; iov[*idx].iov_base = base;
iov[*idx].iov_len = len; iov[*idx].iov_len = len;

Просмотреть файл

@ -51,11 +51,9 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM,
const ddt_elem_desc_t* _elem = &((ELEM)->elem); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
unsigned char* _source = (SOURCE) + _elem->disp; unsigned char* _source = (SOURCE) + _elem->disp;
unsigned char* _destination = (DESTINATION) + _elem->disp; unsigned char* _destination = (DESTINATION) + _elem->disp;
size_t total_count = _elem->count * _elem->blocklen; size_t do_now = _elem->count, do_now_bytes;
size_t do_now, do_now_bytes;
assert( (COUNT) == total_count); assert( (COUNT) == (do_now * _elem->blocklen));
assert( total_count <= ((*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size) );
/* We don't need a prologue and epilogue here as we are __always__ working /* We don't need a prologue and epilogue here as we are __always__ working
* with full copies of the data description. * with full copies of the data description.
@ -64,21 +62,19 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM,
/** /**
* Compute how many full blocklen we need to do and do them. * Compute how many full blocklen we need to do and do them.
*/ */
do_now = _elem->count;
if( 0 != do_now ) {
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
assert( (do_now * do_now_bytes) <= (*SPACE) );
for(size_t _i = 0; _i < do_now; _i++ ) { for(size_t _i = 0; _i < do_now; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE), OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE),
(DATATYPE), (TOTAL_COUNT) ); (DATATYPE), (TOTAL_COUNT) );
DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n",
STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) ); ); STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) - _i * do_now_bytes ); );
MEM_OP( _destination, _source, do_now_bytes ); MEM_OP( _destination, _source, do_now_bytes );
_destination += _elem->extent; _destination += _elem->extent;
_source += _elem->extent; _source += _elem->extent;
*(SPACE) -= do_now_bytes;
}
(COUNT) -= total_count;
} }
*(SPACE) -= (do_now_bytes * do_now);
} }
static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, static inline void _contiguous_loop( const dt_elem_desc_t* ELEM,

Просмотреть файл

@ -224,7 +224,6 @@ int32_t opal_datatype_init( void )
OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_CONTIGUOUS |
OPAL_DATATYPE_FLAG_NO_GAPS; OPAL_DATATYPE_FLAG_NO_GAPS;
datatype->desc.desc[0].elem.common.type = i; datatype->desc.desc[0].elem.common.type = i;
/* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */
datatype->desc.desc[0].elem.count = 1; datatype->desc.desc[0].elem.count = 1;
datatype->desc.desc[0].elem.blocklen = 1; datatype->desc.desc[0].elem.blocklen = 1;
datatype->desc.desc[0].elem.disp = 0; datatype->desc.desc[0].elem.disp = 0;

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -53,8 +53,6 @@
#endif /* defined(CHECKSUM) */ #endif /* defined(CHECKSUM) */
#define IOVEC_MEM_LIMIT 8192
/* the contig versions do not use the stack. They can easily retrieve /* the contig versions do not use the stack. They can easily retrieve
* the status with just the information from pConvertor->bConverted. * the status with just the information from pConvertor->bConverted.
*/ */
@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
unsigned char *source_base = NULL; unsigned char *source_base = NULL;
uint32_t iov_count; uint32_t iov_count;
size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted; size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp); source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp);
/* There are some optimizations that can be done if the upper level /* There are some optimizations that can be done if the upper level
* does not provide a buffer. * does not provide a buffer.
@ -111,15 +108,18 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
uint32_t* out_size, uint32_t* out_size,
size_t* max_data ) size_t* max_data )
{ {
size_t remaining, length, initial_bytes_converted = pConv->bConverted;
const opal_datatype_t* pData = pConv->pDesc; const opal_datatype_t* pData = pConv->pDesc;
dt_stack_t* stack = pConv->pStack; dt_stack_t* stack = pConv->pStack;
ptrdiff_t extent = pData->ub - pData->lb;
unsigned char *user_memory, *packed_buffer; unsigned char *user_memory, *packed_buffer;
uint32_t iov_count, index; uint32_t idx;
size_t i; size_t i;
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted;
ptrdiff_t extent= pData->ub - pData->lb;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
/* The memory layout is contiguous with gaps at the beginning and at the end. The datatype true_lb
* is the initial displacement, the size is the length of the contiguous area, and the extent represents
* how much we should jump between elements.
*/
assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) ); assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) );
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
(void*)pConv->pBaseBuf, *out_size ); ); (void*)pConv->pBaseBuf, *out_size ); );
@ -127,139 +127,97 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
stack[1].type = opal_datatype_uint1.id; stack[1].type = opal_datatype_uint1.id;
} }
/* We can provide directly the pointers in the user buffers (like the convertor_raw) */
if( NULL == iov[0].iov_base ) {
user_memory = pConv->pBaseBuf + pData->true_lb;
/* There are some optimizations that can be done if the upper level for( idx = 0; (idx < (*out_size)) && stack[0].count; idx++ ) {
* does not provide a buffer. iov[idx].iov_base = user_memory + stack[0].disp + stack[1].disp;
*/ iov[idx].iov_len = stack[1].count;
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
pConv->bConverted += stack[1].count;
stack[0].disp += extent;
stack[0].count--;
stack[1].disp = 0;
stack[1].count = pData->size; /* we might need this to update the partial
* length for the first iteration */
}
goto update_status_and_return;
}
for( idx = 0; idx < (*out_size); idx++ ) {
/* Limit the amount of packed data to the data left over on this convertor */ /* Limit the amount of packed data to the data left over on this convertor */
remaining = pConv->local_size - pConv->bConverted; remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */ if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_count].iov_len ) if( remaining > iov[idx].iov_len )
remaining = iov[iov_count].iov_len; remaining = iov[idx].iov_len;
packed_buffer = (unsigned char *)iov[iov_count].iov_base; packed_buffer = (unsigned char *)iov[idx].iov_base;
bConverted = remaining; /* how much will get unpacked this time */ pConv->bConverted += remaining;
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp; user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
i = pConv->count - stack[0].count; /* how many we already packed */
assert(i == (pConv->bConverted / pData->size));
if( packed_buffer == NULL ) { DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n",
/* special case for small data. We avoid allocating memory if we (void*)user_memory, (void*)packed_buffer, remaining ); );
* can fill the iovec directly with the address of the remaining
* data.
*/
if( stack->count < (size_t)((*out_size) - iov_count) ) {
stack[1].count = pData->size - (pConv->bConverted % pData->size);
for( index = iov_count; i < pConv->count; i++, index++ ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = stack[1].count;
stack[0].disp += extent;
pConv->bConverted += stack[1].count;
stack[1].disp = 0; /* reset it for the next round */
stack[1].count = pData->size;
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
}
*out_size = iov_count + index;
*max_data = (pConv->bConverted - initial_bytes_converted);
pConv->flags |= CONVERTOR_COMPLETED;
return 1; /* we're done */
}
/* now special case for big contiguous data with gaps around */
if( pData->size >= IOVEC_MEM_LIMIT ) {
/* as we dont have to copy any data, we can simply fill the iovecs
* with data from the user data description.
*/
for( index = iov_count; (i < pConv->count) && (index < (*out_size));
i++, index++ ) {
if( remaining < pData->size ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = remaining;
remaining = 0;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
break;
} else {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = pData->size;
user_memory += extent;
COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv );
}
remaining -= iov[index].iov_len;
pConv->bConverted += iov[index].iov_len;
}
*out_size = index;
*max_data = (pConv->bConverted - initial_bytes_converted);
if( pConv->bConverted == pConv->local_size ) {
pConv->flags |= CONVERTOR_COMPLETED;
return 1;
}
return 0;
}
}
{
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */ length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
/* data left from last round and enough space in the buffer */ /* data left from last round and enough space in the buffer */
if( (0 != length) && (length <= remaining)) { if( (pData->size != length) && (length <= remaining)) {
/* copy the partial left-over from the previous round */ /* copy the partial left-over from the previous round */
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); (void*)user_memory, (void*)packed_buffer, length ); );
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
packed_buffer += length; packed_buffer += length;
user_memory += (extent - pData->size + length);
remaining -= length; remaining -= length;
stack[1].count -= length; stack[1].count -= length;
stack[1].disp += length; /* just in case, we overwrite this below */
if( 0 == stack[1].count) { /* one completed element */ if( 0 == stack[1].count) { /* one completed element */
stack[0].count--; stack[0].count--;
stack[0].disp += extent; stack[0].disp += extent;
if( 0 != stack[0].count ) { /* not yet done */ if( 0 == stack[0].count ) /* no element left: we are done */
break;
stack[1].count = pData->size; stack[1].count = pData->size;
stack[1].disp = 0; stack[1].disp = 0;
} }
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
} }
}
for( i = 0; pData->size <= remaining; i++ ) { for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); (void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); );
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
packed_buffer += pData->size; packed_buffer += pData->size;
user_memory += extent; user_memory += extent;
remaining -= pData->size; remaining -= pData->size;
} }
stack[0].count -= i; /* the filled up and the entire types */ stack[0].count -= i; /* the entire datatype copied above */
stack[0].disp += (i * extent); stack[0].disp += (i * extent);
stack[1].disp += remaining;
/* Copy the last bits */ /* Copy the last bits */
if( 0 != remaining ) { if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); (void*)user_memory, (void*)packed_buffer, remaining ); );
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
user_memory += remaining;
stack[1].count -= remaining; stack[1].count -= remaining;
} stack[1].disp += remaining; /* keep the += in case we are copying less than the datatype size */
if( 0 == stack[1].count ) { /* prepare for the next element */ if( 0 == stack[1].count ) { /* prepare for the next element */
stack[1].count = pData->size; stack[1].count = pData->size;
stack[1].disp = 0; stack[1].disp = 0;
} }
} }
pConv->bConverted += bConverted;
} }
*out_size = iov_count;
*max_data = (pConv->bConverted - initial_bytes_converted); update_status_and_return:
if( pConv->bConverted == pConv->local_size ) { *out_size = idx;
pConv->flags |= CONVERTOR_COMPLETED; *max_data = pConv->bConverted - initial_bytes_converted;
return 1; if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED;
} return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */
return 0;
} }
/* The pack/unpack functions need a cleanup. I have to create a proper interface to access /* The pack/unpack functions need a cleanup. I have to create a proper interface to access

Просмотреть файл

@ -35,82 +35,90 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
size_t* SPACE ) size_t* SPACE )
{ {
const ddt_elem_desc_t* _elem = &((ELEM)->elem); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t total_count = _elem->count * _elem->blocklen;
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes; size_t do_now, do_now_bytes;
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
assert( *(COUNT) <= _elem->count * _elem->blocklen); assert( *(COUNT) <= _elem->count * _elem->blocklen);
if( cando_count > *(COUNT) ) if( cando_count > *(COUNT) )
cando_count = *(COUNT); cando_count = *(COUNT);
/** if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
* First check if we already did something on this element ? *(COUNT) -= cando_count;
*/ for(; cando_count > 0; cando_count--) {
do_now = (total_count - *(COUNT)); /* done elements */ OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
if( 0 != do_now ) { (CONVERTOR)->pDesc, (CONVERTOR)->count );
do_now = do_now % _elem->blocklen; /* partial blocklen? */ DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
_packed += blocklen_bytes;
_memory += _elem->extent;
}
goto update_and_return;
}
blocklen_bytes *= _elem->blocklen;
/**
* First check if we already did something on this element ? The COUNT is the number
* of remaining predefined types in the current elem, not how many predefined types
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
/* preemptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
if( 0 != do_now ) { if( 0 != do_now ) {
size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ size_t left_in_block = do_now; /* left in the current blocklen */
do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; do_now = (do_now > cando_count ) ? cando_count : do_now;
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n",
(void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); _packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
_memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; _memory += (ptrdiff_t)do_now_bytes;
/* compensate if we just completed a blocklen */ /* compensate if we just completed a blocklen */
if( do_now == left_in_block ) if( do_now == left_in_block )
_memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); _memory += _elem->extent - blocklen_bytes;
*(packed) += do_now_bytes; _packed += do_now_bytes;
*(SPACE) -= do_now_bytes;
*(COUNT) -= do_now;
cando_count -= do_now; cando_count -= do_now;
} }
}
/** /* Do as many full blocklen as possible */
* Compute how many full blocklen we need to do and do them. for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
*/ OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
do_now = cando_count / _elem->blocklen;
if( 0 != do_now ) {
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
for(size_t _i = 0; _i < do_now; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
*(packed) += do_now_bytes; _packed += blocklen_bytes;
_memory += _elem->extent; _memory += _elem->extent;
*(SPACE) -= do_now_bytes;
*(COUNT) -= _elem->blocklen;
cando_count -= _elem->blocklen; cando_count -= _elem->blocklen;
} }
}
/** /**
* As an epilog do anything left from the last blocklen. * As an epilog do anything left from the last blocklen.
*/ */
do_now = cando_count; if( 0 != cando_count ) {
if( 0 != do_now ) { assert( cando_count < _elem->blocklen );
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n",
(void*)*(packed), (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); (void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
MEMCPY_CSUM( *(packed), _memory, do_now_bytes, (CONVERTOR) ); MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
_memory += do_now_bytes; _memory += do_now_bytes;
*(packed) += do_now_bytes; _packed += do_now_bytes;
*(SPACE) -= do_now_bytes;
*(COUNT) -= do_now;
} }
update_and_return:
*(memory) = _memory - _elem->disp; *(memory) = _memory - _elem->disp;
*(SPACE) -= (_packed - *packed);
*(packed) = _packed;
} }
static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,

Просмотреть файл

@ -49,10 +49,24 @@
* - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless. * - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless.
*/ */
/**
 * Advance the positioning state by one step.
 *
 * Moves *mem forward by mem_update, and decreases *space by space_update
 * and *cnt by cnt_update. The memory step and the space step are separate
 * parameters because callers in position_predefined_data pass either the
 * number of bytes covered or the element extent as mem_update.
 *
 * @param CONVERTOR    convertor whose pBaseBuf, pDesc and count are handed
 *                     to the safeguard macro
 * @param mem          in/out: current memory position
 * @param mem_update   amount added to *mem
 * @param space        in/out: remaining space
 * @param space_update amount subtracted from *space
 * @param cnt          in/out: remaining count
 * @param cnt_update   amount subtracted from *cnt
 */
static inline void
position_single_block(opal_convertor_t* CONVERTOR,
unsigned char** mem, ptrdiff_t mem_update,
size_t* space, size_t space_update,
size_t* cnt, size_t cnt_update)
{
/* NOTE(review): presumably checks that mem_update bytes starting at *mem
 * stay inside the user buffer - confirm against the macro definition. The
 * length checked is mem_update (which may be an extent), not space_update. */
OPAL_DATATYPE_SAFEGUARD_POINTER( *mem, mem_update, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
/* The trace is emitted before the updates, so it reports the incoming
 * values of *mem and *space. The "[prolog]" tag is printed for every
 * caller, not only for the prolog step. */
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n",
(void*)*mem, (unsigned long)space_update, (unsigned long)(*space) ); );
*mem += mem_update;
*space -= space_update;
*cnt -= cnt_update;
}
/** /**
* Advance the current position in the convertor based using the * Advance the convertor's position. Update the pointer and the remaining space
* current element and a left-over counter. Update the head pointer * accordingly.
* and the leftover byte space.
*/ */
static inline void static inline void
position_predefined_data( opal_convertor_t* CONVERTOR, position_predefined_data( opal_convertor_t* CONVERTOR,
@ -64,7 +78,7 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
const ddt_elem_desc_t* _elem = &((ELEM)->elem); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t total_count = _elem->count * _elem->blocklen; size_t total_count = _elem->count * _elem->blocklen;
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes; size_t do_now, do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
unsigned char* _memory = (*POINTER) + _elem->disp; unsigned char* _memory = (*POINTER) + _elem->disp;
assert( *(COUNT) <= _elem->count * _elem->blocklen); assert( *(COUNT) <= _elem->count * _elem->blocklen);
@ -72,6 +86,15 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
if( cando_count > *(COUNT) ) if( cando_count > *(COUNT) )
cando_count = *(COUNT); cando_count = *(COUNT);
if( 1 == _elem->blocklen ) {
DO_DEBUG( opal_output( 0, "position( %p, %" PRIsize_t " ) x (count %" PRIsize_t ", extent %ld) => space %lu [prolog]\n",
(void*)_memory, (unsigned long)do_now_bytes, cando_count, _elem->extent, (unsigned long)(*SPACE) ); );
_memory += cando_count * _elem->extent;
*SPACE -= cando_count * do_now_bytes;
*COUNT -= cando_count;
goto update_and_return;
}
/** /**
* First check if we already did something on this element ? * First check if we already did something on this element ?
*/ */
@ -84,16 +107,12 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; do_now = (left_in_block > cando_count ) ? cando_count : left_in_block;
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, position_single_block( CONVERTOR, &_memory, do_now_bytes,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); SPACE, do_now_bytes, COUNT, do_now );
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n",
(void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
_memory = *(POINTER) + _elem->disp + (ptrdiff_t)do_now_bytes;
/* compensate if we just completed a blocklen */ /* compensate if we just completed a blocklen */
if( do_now == left_in_block ) if( do_now == left_in_block )
_memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); _memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size);
*(SPACE) -= do_now_bytes;
*(COUNT) -= do_now;
cando_count -= do_now; cando_count -= do_now;
} }
} }
@ -105,13 +124,8 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
if( 0 != do_now ) { if( 0 != do_now ) {
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
for(size_t _i = 0; _i < do_now; _i++ ) { for(size_t _i = 0; _i < do_now; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, position_single_block( CONVERTOR, &_memory, _elem->extent,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); SPACE, do_now_bytes, COUNT, _elem->blocklen );
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu\n",
(void*)_memory, (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); );
_memory += _elem->extent;
*(SPACE) -= do_now_bytes;
*(COUNT) -= _elem->blocklen;
cando_count -= _elem->blocklen; cando_count -= _elem->blocklen;
} }
} }
@ -122,15 +136,11 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
do_now = cando_count; do_now = cando_count;
if( 0 != do_now ) { if( 0 != do_now ) {
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, position_single_block( CONVERTOR, &_memory, do_now_bytes,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); SPACE, do_now_bytes, COUNT, do_now );
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [epilog]\n",
(void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
_memory += do_now_bytes;
*(SPACE) -= do_now_bytes;
*(COUNT) -= do_now;
} }
update_and_return:
*(POINTER) = _memory - _elem->disp; *(POINTER) = _memory - _elem->disp;
} }

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -70,98 +70,82 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
{ {
const opal_datatype_t *pData = pConv->pDesc; const opal_datatype_t *pData = pConv->pDesc;
unsigned char *user_memory, *packed_buffer; unsigned char *user_memory, *packed_buffer;
uint32_t iov_count, i; uint32_t iov_idx, i;
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; size_t remaining, initial_bytes_converted = pConv->bConverted;
dt_stack_t* stack = pConv->pStack; dt_stack_t* stack = pConv->pStack;
ptrdiff_t extent = pData->ub - pData->lb; ptrdiff_t extent = pData->ub - pData->lb;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov count %d )\n",
(void*)pConv->pBaseBuf, *out_size ); ); (void*)pConv->pBaseBuf, *out_size ); );
if( stack[1].type != opal_datatype_uint1.id ) { if( stack[1].type != opal_datatype_uint1.id ) {
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
stack[1].type = opal_datatype_uint1.id; stack[1].type = opal_datatype_uint1.id;
} }
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_count].iov_len )
remaining = iov[iov_count].iov_len;
packed_buffer = (unsigned char*)iov[iov_count].iov_base;
bConverted = remaining; /* how much will get unpacked this time */
user_memory = pConv->pBaseBuf + initial_displ;
if( (ptrdiff_t)pData->size == extent ) { if( (ptrdiff_t)pData->size == extent ) {
user_memory += pConv->bConverted; for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) {
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", remaining = pConv->local_size - pConv->bConverted;
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_idx].iov_len )
remaining = iov[iov_idx].iov_len;
packed_buffer = (unsigned char*)iov[iov_idx].iov_base;
user_memory = pConv->pBaseBuf + pData->true_lb + pConv->bConverted;
/* contiguous data or basic datatype with count */ /* contiguous data or basic datatype with count */
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining,
pConv->pBaseBuf, pData, pConv->count ); pConv->pBaseBuf, pData, pConv->count );
DO_DEBUG( opal_output( 0, "1. unpack contig dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "unpack contig [%d] dest %p src %p length %" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); );
MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
pConv->bConverted += remaining; /* how much will get unpacked this time */
}
} else { } else {
user_memory += stack[0].disp + stack[1].disp; for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) {
remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_idx].iov_len )
remaining = iov[iov_idx].iov_len;
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", packed_buffer = (unsigned char*)iov[iov_idx].iov_base;
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
pConv->bConverted += remaining; /* how much will get unpacked this time */
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last unpack */ for( i = 0; stack[1].count <= remaining; i++ ) { /* partial or full data */
/* complete the last copy */ OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, stack[1].count, pConv->pBaseBuf,
if( (0 != length) && (length <= remaining) ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. unpack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [%d]\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); iov_idx, (void*)user_memory, (void*)packed_buffer, stack[1].count, i ); );
MEMCPY_CSUM( user_memory, packed_buffer, length, pConv ); MEMCPY_CSUM( user_memory, packed_buffer, stack[1].count, pConv );
packed_buffer += length;
user_memory += (extent - (pData->size - length)); packed_buffer += stack[1].count;
remaining -= length; remaining -= stack[1].count;
stack[1].count -= length;
if( 0 == stack[1].count) { /* one completed element */
stack[0].count--; stack[0].count--;
stack[0].disp += extent; stack[0].disp += extent;
if( 0 != stack[0].count ) { /* not yet done */
stack[1].count = pData->size; stack[1].count = pData->size;
stack[1].disp = 0; stack[1].disp = 0;
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp;
} }
}
} /* Copy the last bits */
for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "3. unpack dest %p src %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); );
MEMCPY_CSUM( user_memory, packed_buffer, pData->size, pConv );
packed_buffer += pData->size;
user_memory += extent;
remaining -= pData->size;
}
stack[0].count -= i;
stack[0].disp += (i * extent);
stack[1].disp += remaining;
/* copy the last bits */
if( 0 != remaining ) { if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. unpack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [epilog]\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); );
MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
user_memory += remaining;
stack[1].count -= remaining; stack[1].count -= remaining;
stack[1].disp += remaining; /* keep the += in case we are copying less than the datatype size */
assert( stack[1].count );
} }
} }
pConv->bConverted += bConverted;
} }
*out_size = iov_count; /* we only reach this line after the for loop successfully completes */ *out_size = iov_idx; /* we only reach this line after the for loop successfully completes */
*max_data = (pConv->bConverted - initial_bytes_converted); *max_data = pConv->bConverted - initial_bytes_converted;
if( pConv->bConverted == pConv->local_size ) { if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED;
pConv->flags |= CONVERTOR_COMPLETED; return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */
return 1;
}
return 0;
} }
/** /**
@ -179,7 +163,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
static inline void static inline void
opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem, opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem,
unsigned char* partial_data, unsigned char* partial_data,
ptrdiff_t start_position, ptrdiff_t length, ptrdiff_t start_position, size_t length,
unsigned char** user_buffer ) unsigned char** user_buffer )
{ {
char unused_byte = 0x7F, saved_data[16]; char unused_byte = 0x7F, saved_data[16];
@ -195,7 +179,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
/* Find a byte that is not used in the partial buffer */ /* Find a byte that is not used in the partial buffer */
find_unused_byte: find_unused_byte:
for(ptrdiff_t i = 0; i < length; i++ ) { for(size_t i = 0; i < length; i++ ) {
if( unused_byte == partial_data[i] ) { if( unused_byte == partial_data[i] ) {
unused_byte--; unused_byte--;
goto find_unused_byte; goto find_unused_byte;
@ -306,7 +290,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); COMPUTE_CSUM( iov_ptr, missing_length, pConvertor );
opal_unpack_partial_datatype( pConvertor, pElem, opal_unpack_partial_datatype( pConvertor, pElem,
iov_ptr, iov_ptr,
pConvertor->partial_length, element_length - pConvertor->partial_length, pConvertor->partial_length, (size_t)(element_length - pConvertor->partial_length),
&conv_ptr ); &conv_ptr );
--count_desc; --count_desc;
if( 0 == count_desc ) { if( 0 == count_desc ) {

Просмотреть файл

@ -35,82 +35,90 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
size_t* SPACE ) size_t* SPACE )
{ {
const ddt_elem_desc_t* _elem = &((ELEM)->elem); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t total_count = _elem->count * _elem->blocklen;
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size; size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes; size_t do_now, do_now_bytes;
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
unsigned char* _memory = (*memory) + _elem->disp; unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
assert( *(COUNT) <= _elem->count * _elem->blocklen); assert( *(COUNT) <= (_elem->count * _elem->blocklen));
if( cando_count > *(COUNT) ) if( cando_count > *(COUNT) )
cando_count = *(COUNT); cando_count = *(COUNT);
/** if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
* First check if we already did something on this element ? *(COUNT) -= cando_count;
*/ for(; cando_count > 0; cando_count--) {
do_now = (total_count - *(COUNT)); /* done elements */ OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
if( 0 != do_now ) { (CONVERTOR)->pDesc, (CONVERTOR)->count );
do_now = do_now % _elem->blocklen; /* partial blocklen? */ DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) );
_packed += blocklen_bytes;
_memory += _elem->extent;
}
goto update_and_return;
}
blocklen_bytes *= _elem->blocklen;
/**
* First check if we already did something on this element ? The COUNT is the number
* of remaining predefined types in the current elem, not how many predefined types
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
/* preemptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
if( 0 != do_now ) { if( 0 != do_now ) {
size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */ size_t left_in_block = do_now; /* left in the current blocklen */
do_now = (left_in_block > cando_count ) ? cando_count : left_in_block; do_now = (do_now > cando_count ) ? cando_count : do_now;
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n", DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu [prolog]\n",
(void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) );
_memory = (*memory) + _elem->disp + (ptrdiff_t)do_now_bytes; _memory += (ptrdiff_t)do_now_bytes;
/* compensate if we just completed a blocklen */ /* compensate if we just completed a blocklen */
if( do_now == left_in_block ) if( do_now == left_in_block )
_memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size); _memory += _elem->extent - blocklen_bytes;
*(packed) += do_now_bytes; _packed += do_now_bytes;
*(SPACE) -= do_now_bytes;
*(COUNT) -= do_now;
cando_count -= do_now; cando_count -= do_now;
} }
}
/** /* Do as many full blocklen as possible */
* Compute how many full blocklen we need to do and do them. for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
*/ OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
do_now = cando_count / _elem->blocklen;
if( 0 != do_now ) {
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
for(size_t _i = 0; _i < do_now; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)*(SPACE) ); ); (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) );
*(packed) += do_now_bytes; _packed += blocklen_bytes;
_memory += _elem->extent; _memory += _elem->extent;
*(SPACE) -= do_now_bytes;
*(COUNT) -= _elem->blocklen;
cando_count -= _elem->blocklen; cando_count -= _elem->blocklen;
} }
}
/** /**
* As an epilog do anything left from the last blocklen. * As an epilog do anything left from the last blocklen.
*/ */
do_now = cando_count; if( 0 != cando_count ) {
if( 0 != do_now ) { assert( cando_count < _elem->blocklen );
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size; do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n", DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n",
(void*)_memory, (void*)*(packed), (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); ); (void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _memory, *(packed), do_now_bytes, (CONVERTOR) ); MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) );
_memory += do_now_bytes; _memory += do_now_bytes;
*(packed) += do_now_bytes; _packed += do_now_bytes;
*(SPACE) -= do_now_bytes;
*(COUNT) -= do_now;
} }
update_and_return:
*(memory) = _memory - _elem->disp; *(memory) = _memory - _elem->disp;
*(SPACE) -= (_packed - *packed);
*(packed) = _packed;
} }
static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,