Keep only the fastest version of the homogeneous unpack. Reorder the code
to improve the execution path. Some minor clean-ups. This commit was SVN r9773.
Этот коммит содержится в:
родитель
e23c3008a6
Коммит
b990870b55
@ -35,12 +35,10 @@ extern int ompi_unpack_debug;
|
||||
|
||||
#if defined(CHECKSUM)
|
||||
#define ompi_unpack_general_function ompi_unpack_general_checksum
|
||||
#define ompi_unpack_homogeneous_function ompi_unpack_homogeneous_checksum
|
||||
#define ompi_unpack_homogeneous_contig_function ompi_unpack_homogeneous_contig_checksum
|
||||
#define ompi_generic_simple_unpack_function ompi_generic_simple_unpack_checksum
|
||||
#else
|
||||
#define ompi_unpack_general_function ompi_unpack_general
|
||||
#define ompi_unpack_homogeneous_function ompi_unpack_homogeneous
|
||||
#define ompi_unpack_homogeneous_contig_function ompi_unpack_homogeneous_contig
|
||||
#define ompi_generic_simple_unpack_function ompi_generic_simple_unpack
|
||||
#endif /* defined(CHECKSUM) */
|
||||
@ -173,155 +171,6 @@ ompi_unpack_general_function( ompi_convertor_t* pConvertor,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t
|
||||
ompi_unpack_homogeneous_function( ompi_convertor_t* pConv,
|
||||
struct iovec* iov,
|
||||
uint32_t* out_size,
|
||||
size_t* max_data,
|
||||
int32_t* freeAfter )
|
||||
{
|
||||
dt_stack_t* pStack; /* pointer to the position on the stack */
|
||||
uint32_t pos_desc; /* actual position in the description of the derived datatype */
|
||||
uint32_t i; /* counter for basic datatype with extent */
|
||||
int bConverted = 0; /* number of bytes converted this time */
|
||||
long lastDisp = 0;
|
||||
size_t space = iov[0].iov_len, last_count = 0, last_blength = 0;
|
||||
char* pSrcBuf;
|
||||
const ompi_datatype_t* pData = pConv->pDesc;
|
||||
dt_elem_desc_t* pElems;
|
||||
|
||||
pSrcBuf = iov[0].iov_base;
|
||||
|
||||
pElems = pConv->use_desc->desc;
|
||||
pStack = pConv->pStack + pConv->stack_pos;
|
||||
pos_desc = pStack->index;
|
||||
lastDisp = pStack->disp;
|
||||
last_count = pStack->count;
|
||||
/*opal_output( 0, "ompi_convertor_unpack_homogeneous stack_pos %d index %d count %d lastDisp %ld bConverted %d\n",
|
||||
pConv->stack_pos, pStack->index, pStack->count, lastDisp, pConv->bConverted );*/
|
||||
pStack--;
|
||||
pConv->stack_pos--;
|
||||
|
||||
while( 1 ) { /* loop forever. The exit condition is detected inside the while loop */
|
||||
if( DT_END_LOOP == pElems[pos_desc].elem.common.type ) { /* end of the current loop */
|
||||
if( --(pStack->count) == 0 ) { /* end of loop */
|
||||
if( pConv->stack_pos == 0 ) {
|
||||
last_blength = 0; /* nothing to copy anymore */
|
||||
goto end_loop;
|
||||
}
|
||||
pStack--;
|
||||
pConv->stack_pos--;
|
||||
pos_desc++;
|
||||
} else {
|
||||
if( pStack->index == -1 ) {
|
||||
pStack->disp += (pData->ub - pData->lb);
|
||||
} else {
|
||||
assert( DT_LOOP == pElems[pStack->index].elem.common.type );
|
||||
pStack->disp += pElems[pStack->index].loop.extent;
|
||||
}
|
||||
pos_desc = pStack->index + 1;
|
||||
}
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
continue;
|
||||
}
|
||||
while( DT_LOOP == pElems[pos_desc].elem.common.type ) {
|
||||
int stop_in_loop = 0;
|
||||
if( pElems[pos_desc].loop.common.flags & DT_FLAG_CONTIGUOUS ) {
|
||||
ddt_endloop_desc_t* end_loop = &(pElems[pos_desc + pElems[pos_desc].loop.items].end_loop);
|
||||
last_count = pElems[pos_desc].loop.loops;
|
||||
if( (end_loop->size * last_count) > space ) {
|
||||
stop_in_loop = last_count;
|
||||
last_count = space / end_loop->size;
|
||||
}
|
||||
for( i = 0; i < last_count; i++ ) {
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, end_loop->size,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
/*opal_output( 0, "3. memcpy %p, %p, %d", pConv->pBaseBuf + lastDisp, pSrcBuf, end_loop->size );*/
|
||||
MEMCPY_CSUM( pConv->pBaseBuf + lastDisp, pSrcBuf, end_loop->size, pConv );
|
||||
pSrcBuf += end_loop->size;
|
||||
lastDisp += pElems[pos_desc].loop.extent;
|
||||
}
|
||||
space -= (end_loop->size * last_count);
|
||||
bConverted += (end_loop->size * last_count);
|
||||
if( stop_in_loop == 0 ) {
|
||||
pos_desc += pElems[pos_desc].loop.items + 1;
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
continue;
|
||||
}
|
||||
last_count = stop_in_loop - last_count;
|
||||
last_blength = 0;
|
||||
/* Save the stack with the correct last_count value. */
|
||||
}
|
||||
PUSH_STACK( pStack, pConv->stack_pos, pos_desc, DT_LOOP, last_count,
|
||||
pStack->disp, pos_desc + pElems[pos_desc].loop.items );
|
||||
pos_desc++;
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
}
|
||||
/* now here we have a basic datatype */
|
||||
while( pElems[pos_desc].elem.common.flags & DT_FLAG_DATA ) {
|
||||
const ompi_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]);
|
||||
/* do we have enough space in the buffer ? */
|
||||
last_blength = last_count * basic_type->size;
|
||||
if( pElems[pos_desc].elem.common.flags & DT_FLAG_CONTIGUOUS ) {
|
||||
if( space < last_blength ) {
|
||||
last_blength = space / basic_type->size;
|
||||
last_count -= last_blength;
|
||||
last_blength *= basic_type->size;
|
||||
space -= last_blength;
|
||||
goto end_loop; /* or break whatever but go out of this while */
|
||||
}
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, last_blength,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
/*opal_output( 0, "1. memcpy %p, %p, %d -> %d", pConv->pBaseBuf + lastDisp, pSrcBuf, last_blength, bConverted );*/
|
||||
MEMCPY_CSUM( pConv->pBaseBuf + lastDisp, pSrcBuf, last_blength, pConv );
|
||||
bConverted += last_blength;
|
||||
space -= last_blength;
|
||||
pSrcBuf += last_blength;
|
||||
} else {
|
||||
uint32_t i;
|
||||
|
||||
last_blength = basic_type->size;
|
||||
for( i = 0; i < last_count; i++ ) {
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, last_blength,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
/*opal_output( 0, "2. memcpy %p, %p, %d", pConv->pBaseBuf + lastDisp, pSrcBuf, last_blength );*/
|
||||
MEMCPY_CSUM( pConv->pBaseBuf + lastDisp, pSrcBuf, last_blength, pConv );
|
||||
lastDisp += pElems[pos_desc].elem.extent;
|
||||
pSrcBuf += basic_type->size;
|
||||
}
|
||||
bConverted += basic_type->size * last_count;
|
||||
}
|
||||
pos_desc++; /* advance to the next data */
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
}
|
||||
}
|
||||
end_loop:
|
||||
if( last_blength != 0 ) { /* save the internal state */
|
||||
/* update corresponding the the datatype length */
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, last_blength,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
MEMCPY_CSUM( pConv->pBaseBuf + lastDisp, pSrcBuf, last_blength, pConv );
|
||||
/*opal_output( 0, "1. memcpy %p, %p, %d -> %d", pConv->pBaseBuf + lastDisp, pSrcBuf, last_blength, bConverted );*/
|
||||
bConverted += last_blength;
|
||||
lastDisp += last_blength;
|
||||
}
|
||||
|
||||
pConv->bConverted += bConverted; /* update the converted field */
|
||||
iov[0].iov_len = bConverted; /* update the iovec length */
|
||||
*max_data = bConverted;
|
||||
|
||||
if( pConv->bConverted == pConv->remote_size ) {
|
||||
pConv->flags |= CONVERTOR_COMPLETED;
|
||||
return 1;
|
||||
}
|
||||
PUSH_STACK( pStack, pConv->stack_pos, pos_desc, pElems[pos_desc].elem.common.type,
|
||||
last_count, lastDisp, pos_desc );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t
|
||||
ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
struct iovec* iov,
|
||||
@ -483,9 +332,36 @@ ompi_generic_simple_unpack_function( ompi_convertor_t* pConvertor,
|
||||
assert( 0 == element_length );
|
||||
packed_buffer = (char*)iov[iov_count].iov_base + missing_length;
|
||||
iov_len_local -= missing_length;
|
||||
pConvertor->bConverted += element_length;
|
||||
pConvertor->storage.length = 0; /* nothing more inside */
|
||||
}
|
||||
while( 1 ) {
|
||||
while( pElem->elem.common.flags & DT_FLAG_DATA ) {
|
||||
/* now here we have a basic datatype */
|
||||
UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
|
||||
packed_buffer, user_memory_base, iov_len_local );
|
||||
if( 0 == count_desc ) { /* completed */
|
||||
user_memory_base = pConvertor->pBaseBuf + pStack->disp;
|
||||
pos_desc++; /* advance to the next data */
|
||||
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
|
||||
continue;
|
||||
}
|
||||
type = pElem->elem.common.type;
|
||||
assert (type < DT_MAX_PREDEFINED);
|
||||
required_space = ompi_ddt_basicDatatypes[type]->size;
|
||||
if( 0 != iov_len_local ) {
|
||||
/* We have some partial data here. Let's copy it into the convertor
|
||||
* and keep it hot until the next round.
|
||||
*/
|
||||
assert (type < DT_MAX_PREDEFINED);
|
||||
assert( iov_len_local < ompi_ddt_basicDatatypes[type]->size );
|
||||
MEMCPY_CSUM( pConvertor->storage.data, packed_buffer, iov_len_local, pConvertor );
|
||||
DO_DEBUG( opal_output( 0, "Saving %d bytes for the next call\n", iov_len_local ); );
|
||||
pConvertor->storage.length = iov_len_local;
|
||||
iov_len_local = 0;
|
||||
}
|
||||
goto complete_loop;
|
||||
}
|
||||
if( DT_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
|
||||
DO_DEBUG( opal_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %d\n",
|
||||
pStack->count, pConvertor->stack_pos, pos_desc, pStack->disp, iov_len_local ); );
|
||||
@ -535,31 +411,6 @@ ompi_generic_simple_unpack_function( ompi_convertor_t* pConvertor,
|
||||
DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" );
|
||||
continue;
|
||||
}
|
||||
while( pElem->elem.common.flags & DT_FLAG_DATA ) {
|
||||
/* now here we have a basic datatype */
|
||||
UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
|
||||
packed_buffer, user_memory_base, iov_len_local );
|
||||
if( 0 != count_desc ) { /* completed */
|
||||
type = pElem->elem.common.type;
|
||||
assert (type < DT_MAX_PREDEFINED);
|
||||
required_space = ompi_ddt_basicDatatypes[type]->size;
|
||||
if( 0 != iov_len_local ) {
|
||||
/* We have some partial data here. Let's copy it into the convertor
|
||||
* and keep it hot until the next round.
|
||||
*/
|
||||
assert (type < DT_MAX_PREDEFINED);
|
||||
assert( iov_len_local < ompi_ddt_basicDatatypes[type]->size );
|
||||
MEMCPY_CSUM( pConvertor->storage.data, packed_buffer, iov_len_local, pConvertor );
|
||||
DO_DEBUG( opal_output( 0, "Saving %d bytes for the next call\n", iov_len_local ); );
|
||||
pConvertor->storage.length = iov_len_local;
|
||||
iov_len_local = 0;
|
||||
}
|
||||
goto complete_loop;
|
||||
}
|
||||
user_memory_base = pConvertor->pBaseBuf + pStack->disp;
|
||||
pos_desc++; /* advance to the next data */
|
||||
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
|
||||
}
|
||||
}
|
||||
complete_loop:
|
||||
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user