1
1

Refactor the datatype engine to reduce the critical path for the most
trivial (and thus most commonly used) datatype usages. Make the
gaps_contiguous pack and unpack functions similar.
Этот коммит содержится в:
George Bosilca 2014-12-02 16:23:41 +09:00
родитель 8ee501350b
Коммит e640673372
6 изменённых файлов: 144 добавлений и 116 удалений

Просмотреть файл

@ -362,13 +362,12 @@ static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t*
if( OPAL_LIKELY(0 == count) ) {
pStack[1].type = pElems->elem.common.type;
pStack[1].count = pElems->elem.count;
pStack[1].disp = 0;
} else {
pStack[1].type = OPAL_DATATYPE_UINT1;
pStack[1].count = pData->size - count;
pStack[1].disp = count;
}
pStack[1].index = 0; /* useless */
pStack[1].disp = count;
pStack[1].index = 0; /* useless */
pConvertor->bConverted = starting_point;
pConvertor->stack_pos = 1;
@ -400,13 +399,16 @@ int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor,
pStack[0].index = -1;
pStack[0].count = convertor->count;
pStack[0].disp = 0;
pStack[0].type = OPAL_DATATYPE_LOOP;
pStack[1].index = 0;
pStack[1].disp = 0;
if( pElems[0].elem.common.type == OPAL_DATATYPE_LOOP ) {
pStack[1].count = pElems[0].loop.loops;
pStack[1].type = OPAL_DATATYPE_LOOP;
} else {
pStack[1].count = pElems[0].elem.count;
pStack[1].type = pElems[0].elem.common.type;
}
return OPAL_SUCCESS;
}

Просмотреть файл

@ -108,49 +108,55 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
size_t* max_data )
{
const opal_datatype_t* pData = pConv->pDesc;
dt_stack_t* pStack = pConv->pStack;
dt_stack_t* stack = pConv->pStack;
unsigned char *user_memory, *packed_buffer;
uint32_t i, index, iov_count;
size_t max_allowed, total_bytes_converted = 0;
OPAL_PTRDIFF_TYPE extent;
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted;
OPAL_PTRDIFF_TYPE extent= pData->ub - pData->lb;
OPAL_PTRDIFF_TYPE initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
extent = pData->ub - pData->lb;
assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((OPAL_PTRDIFF_TYPE)pData->size != extent) );
/* Limit the amount of packed data to the data left over on this convertor */
max_allowed = pConv->local_size - pConv->bConverted;
if( max_allowed > (*max_data) )
max_allowed = (*max_data);
i = (uint32_t)(pConv->bConverted / pData->size); /* how many we already pack */
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
pConv->pBaseBuf, *out_size ); );
if( stack[1].type != opal_datatype_uint1.id ) {
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
stack[1].type = opal_datatype_uint1.id;
}
/* There are some optimizations that can be done if the upper level
* does not provide a buffer.
*/
user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp;
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
if( 0 == max_allowed ) break; /* we're done this time */
if( iov[iov_count].iov_base == NULL ) {
/* Limit the amount of packed data to the data left over on this convertor */
remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */
if( remaining > (uint32_t)iov[iov_count].iov_len )
remaining = iov[iov_count].iov_len;
packed_buffer = (unsigned char *)iov[iov_count].iov_base;
bConverted = remaining; /* how much will get unpacked this time */
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp;
i = pConv->count - stack[0].count; /* how many we already packed */
assert(i == ((uint32_t)(pConv->bConverted / pData->size)));
if( packed_buffer == NULL ) {
/* special case for small data. We avoid allocating memory if we
* can fill the iovec directly with the address of the remaining
* data.
*/
if( (uint32_t)pStack->count < ((*out_size) - iov_count) ) {
pStack[1].count = pData->size - (pConv->bConverted % pData->size);
if( (uint32_t)stack->count < ((*out_size) - iov_count) ) {
stack[1].count = pData->size - (pConv->bConverted % pData->size);
for( index = iov_count; i < pConv->count; i++, index++ ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = pStack[1].count;
pStack[0].disp += extent;
total_bytes_converted += pStack[1].count;
pStack[1].disp = 0; /* reset it for the next round */
pStack[1].count = pData->size;
user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp;
iov[index].iov_len = stack[1].count;
stack[0].disp += extent;
pConv->bConverted += stack[1].count;
stack[1].disp = 0; /* reset it for the next round */
stack[1].count = pData->size;
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
}
*out_size = iov_count + index;
pConv->bConverted += total_bytes_converted;
*max_data = total_bytes_converted;
*max_data = (pConv->bConverted - initial_bytes_converted);
pConv->flags |= CONVERTOR_COMPLETED;
return 1; /* we're done */
}
@ -161,10 +167,10 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
*/
for( index = iov_count; (i < pConv->count) && (index < (*out_size));
i++, index++ ) {
if( max_allowed < pData->size ) {
if( remaining < pData->size ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = max_allowed;
max_allowed = 0;
iov[index].iov_len = remaining;
remaining = 0;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
break;
} else {
@ -173,12 +179,11 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
user_memory += extent;
COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv );
}
max_allowed -= iov[index].iov_len;
total_bytes_converted += iov[index].iov_len;
remaining -= iov[index].iov_len;
pConv->bConverted += iov[index].iov_len;
}
*out_size = index;
*max_data = total_bytes_converted;
pConv->bConverted += total_bytes_converted;
*max_data = (pConv->bConverted - initial_bytes_converted);
if( pConv->bConverted == pConv->local_size ) {
pConv->flags |= CONVERTOR_COMPLETED;
return 1;
@ -188,52 +193,63 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
}
{
uint32_t counter;
size_t done;
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n",
user_memory, packed_buffer, (unsigned long)remaining ); );
packed_buffer = (unsigned char *) iov[iov_count].iov_base;
done = pConv->bConverted - i * pData->size; /* partial data from last pack */
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
/* data left from last round and enough space in the buffer */
if( (done + max_allowed) >= pData->size ) {
if( (0 != length) && (length <= remaining)) {
/* copy the partial left-over from the previous round */
done = pData->size - done;
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, done, pConv->pBaseBuf, pData, pConv->count );
MEMCPY_CSUM( packed_buffer, user_memory, done, pConv );
packed_buffer += done;
max_allowed -= done;
total_bytes_converted += done;
user_memory += (extent - pData->size + done);
/* copy entire types */
counter = (uint32_t)(max_allowed / pData->size);
if( counter > pConv->count ) counter = pConv->count;
for( i = 0; i < counter; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, pData, pConv->count );
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
packed_buffer+= pData->size;
user_memory += extent;
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)length ); );
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
packed_buffer += length;
user_memory += (extent - pData->size + length);
remaining -= length;
stack[1].count -= length;
if( 0 == stack[1].count) { /* one completed element */
stack[0].count--;
stack[0].disp += extent;
if( 0 != stack[0].count ) { /* not yet done */
stack[1].count = pData->size;
stack[1].disp = 0;
}
}
done = (counter * pData->size);
max_allowed -= done;
total_bytes_converted += done;
}
/* If there is anything pending ... */
if( 0 != max_allowed ) {
done = max_allowed;
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, done, pConv->pBaseBuf, pData, pConv->count );
MEMCPY_CSUM( packed_buffer, user_memory, done, pConv );
packed_buffer += done;
max_allowed = 0;
total_bytes_converted += done;
user_memory += done;
for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)pData->size ); );
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
packed_buffer += pData->size;
user_memory += extent;
remaining -= pData->size;
}
stack[0].count -= i; /* the filled up and the entire types */
stack[0].disp += (i * extent);
stack[1].disp += remaining;
/* Copy the last bits */
if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)remaining ); );
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
user_memory += remaining;
stack[1].count -= remaining;
}
if( 0 == stack[1].count ) { /* prepare for the next element */
stack[1].count = pData->size;
stack[1].disp = 0;
}
}
pConv->bConverted += bConverted;
}
pStack[0].disp = (intptr_t)user_memory - (intptr_t)pConv->pBaseBuf - initial_displ;
pStack[1].disp = max_allowed;
*max_data = total_bytes_converted;
pConv->bConverted += total_bytes_converted;
*out_size = iov_count;
*max_data = (pConv->bConverted - initial_bytes_converted);
if( pConv->bConverted == pConv->local_size ) {
pConv->flags |= CONVERTOR_COMPLETED;
return 1;
@ -371,7 +387,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor,
return 1;
}
/* Save the global position for the next round */
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc,
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
conv_ptr - pConvertor->pBaseBuf );
DO_DEBUG( opal_output( 0, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n",
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); );

Просмотреть файл

@ -27,7 +27,7 @@
#endif
static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
dt_elem_desc_t* ELEM,
const dt_elem_desc_t* ELEM,
uint32_t* COUNT,
unsigned char** SOURCE,
unsigned char** DESTINATION,
@ -35,7 +35,7 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
{
uint32_t _copy_count = *(COUNT);
size_t _copy_blength;
ddt_elem_desc_t* _elem = &((ELEM)->elem);
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
unsigned char* _source = (*SOURCE) + _elem->disp;
_copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size;
@ -73,14 +73,14 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
}
static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,
dt_elem_desc_t* ELEM,
const dt_elem_desc_t* ELEM,
uint32_t* COUNT,
unsigned char** SOURCE,
unsigned char** DESTINATION,
size_t* SPACE )
{
ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM);
ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items);
const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM);
const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items);
unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp;
uint32_t _copy_loops = *(COUNT);
uint32_t _i;

Просмотреть файл

@ -109,9 +109,8 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
dt_stack_t* pStack; /* pointer to the position on the stack */
uint32_t pos_desc; /* actual position in the description of the derived datatype */
uint32_t count_desc; /* the number of items already done in the actual pos_desc */
uint16_t type; /* type at current position */
dt_elem_desc_t* description = pConvertor->use_desc->desc;
dt_elem_desc_t* pElem;
dt_elem_desc_t* pElem; /* current position */
unsigned char *base_pointer = pConvertor->pBaseBuf;
size_t iov_len_local;
OPAL_PTRDIFF_TYPE extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb;
@ -133,8 +132,8 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
(unsigned long)pConvertor->bConverted, (unsigned long)*position, (unsigned long)pConvertor->pDesc->size,
(unsigned long)iov_len_local, count_desc ); );
/* Update all the stack including the last one */
for( type = 0; type <= pConvertor->stack_pos; type++ )
pStack[type].disp += count_desc * extent;
for( pos_desc = 0; pos_desc <= pConvertor->stack_pos; pos_desc++ )
pStack[pos_desc].disp += count_desc * extent;
pConvertor->bConverted += count_desc * pConvertor->pDesc->size;
iov_len_local = *position - pConvertor->bConverted;
pStack[0].count -= count_desc;
@ -228,7 +227,6 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
POSITION_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
base_pointer, iov_len_local );
if( 0 != count_desc ) { /* completed */
type = pElem->elem.common.type;
pConvertor->partial_length = (uint32_t)iov_len_local;
goto complete_loop;
}
@ -245,7 +243,7 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
if( !(pConvertor->flags & CONVERTOR_COMPLETED) ) {
/* I complete an element, next step I should go to the next one */
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc,
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
base_pointer - pConvertor->pBaseBuf );
DO_DEBUG( opal_output( 0, "position save stack stack_pos %d pos_desc %d count_desc %d disp %llx\n",
pConvertor->stack_pos, pStack->index, (int)pStack->count, (unsigned long long)pStack->disp ); );

Просмотреть файл

@ -76,11 +76,16 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
pConv->pBaseBuf, *out_size ); );
if( stack[1].type != opal_datatype_uint1.id ) {
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
stack[1].type = opal_datatype_uint1.id;
}
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
packed_buffer = (unsigned char*)iov[iov_count].iov_base;
remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */
if( remaining > (uint32_t)iov[iov_count].iov_len )
remaining = iov[iov_count].iov_len;
packed_buffer = (unsigned char*)iov[iov_count].iov_base;
bConverted = remaining; /* how much will get unpacked this time */
user_memory = pConv->pBaseBuf + initial_displ;
@ -91,7 +96,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
/* contiguous data or basic datatype with count */
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining,
pConv->pBaseBuf, pData, pConv->count );
pConv->pBaseBuf, pData, pConv->count );
DO_DEBUG( opal_output( 0, "1. unpack contig dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)remaining ); );
MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
@ -101,26 +106,30 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n",
user_memory, packed_buffer, (unsigned long)remaining ); );
length = pConv->bConverted / pData->size; /* already done */
length = pConv->bConverted - length * pData->size; /* how much of the last data we convert */
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last unpack */
/* complete the last copy */
if( length != 0 ) {
length = pData->size - length;
if( length <= remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. unpack dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)length ); );
MEMCPY_CSUM( user_memory, packed_buffer, length, pConv );
packed_buffer += length;
user_memory += (extent - (pData->size - length));
remaining -= length;
if( (0 != length) && (length <= remaining) ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. unpack dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)length ); );
MEMCPY_CSUM( user_memory, packed_buffer, length, pConv );
packed_buffer += length;
user_memory += (extent - (pData->size - length));
remaining -= length;
stack[1].count -= length;
if( 0 == stack[1].count) { /* one completed element */
stack[0].count--;
stack[0].disp += extent;
if( 0 != stack[0].count ) { /* not yet done */
stack[1].count = pData->size;
stack[1].disp = 0;
}
}
}
for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count );
pData, pConv->count );
DO_DEBUG( opal_output( 0, "3. unpack dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)pData->size ); );
MEMCPY_CSUM( user_memory, packed_buffer, pData->size, pConv );
@ -128,16 +137,18 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
user_memory += extent;
remaining -= pData->size;
}
stack[0].disp = (intptr_t)user_memory - (intptr_t)pConv->pBaseBuf - initial_displ;
stack[1].disp = remaining;
stack[0].count -= i;
stack[0].disp += (i * extent);
stack[1].disp += remaining;
/* copy the last bits */
if( remaining != 0 ) {
if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count );
pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. unpack dest %p src %p length %lu\n",
user_memory, packed_buffer, (unsigned long)remaining ); );
MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
user_memory += remaining;
stack[1].count -= remaining;
}
}
pConv->bConverted += bConverted;
@ -400,7 +411,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
return 1;
}
/* Save the global position for the next round */
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc,
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
conv_ptr - pConvertor->pBaseBuf );
DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n",
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); );
@ -564,7 +575,7 @@ opal_unpack_general_function( opal_convertor_t* pConvertor,
return 1;
}
/* Save the global position for the next round */
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc,
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
conv_ptr - pConvertor->pBaseBuf );
DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n",
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); );

Просмотреть файл

@ -24,16 +24,17 @@
CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
#endif
static inline void unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the convertor */
dt_elem_desc_t* ELEM, /* the element description */
uint32_t* COUNT, /* the number of elements */
unsigned char** SOURCE, /* the source pointer */
unsigned char** DESTINATION, /* the destination pointer */
size_t* SPACE ) /* the space in the destination buffer */
static inline void
unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the convertor */
const dt_elem_desc_t* ELEM, /* the element description */
uint32_t* COUNT, /* the number of elements */
unsigned char** SOURCE, /* the source pointer */
unsigned char** DESTINATION, /* the destination pointer */
size_t* SPACE ) /* the space in the destination buffer */
{
uint32_t _copy_count = *(COUNT);
size_t _copy_blength;
ddt_elem_desc_t* _elem = &((ELEM)->elem);
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
unsigned char* _destination = (*DESTINATION) + _elem->disp;
_copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size;
@ -71,14 +72,14 @@ static inline void unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the c
}
static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,
dt_elem_desc_t* ELEM,
const dt_elem_desc_t* ELEM,
uint32_t* COUNT,
unsigned char** SOURCE,
unsigned char** DESTINATION,
size_t* SPACE )
{
ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM);
ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items);
const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM);
const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items);
unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp;
uint32_t _copy_loops = *(COUNT);
uint32_t _i;