Several changes:
- add more comments on the pack and unpack functions. - remove all pack/unpack versions that are not used anymore. - other various cleanups. - update the safeguard macro (which computes the boundaries of the datatype in order to protect us from accessing memory locations outside of the data). - for the contiguous (with or without gaps) pack and unpack, correctly compute the starting point. This commit was SVN r10327.
Этот коммит содержится в:
родитель
3fb5dafdb3
Коммит
88a363fe34
@ -253,8 +253,9 @@ OMPI_DECLSPEC int ompi_ddt_safeguard_pointer_debug_breakpoint( const void* actua
|
||||
{ \
|
||||
char *__lower_bound = (char*)(INITPTR), *__upper_bound; \
|
||||
assert( ((LENGTH) != 0) && ((COUNT) != 0) ); \
|
||||
__lower_bound += (PDATA)->true_lb - (PDATA)->lb; \
|
||||
__upper_bound = (INITPTR) + ((PDATA)->ub - (PDATA)->lb) * ((COUNT) - 1) + (PDATA)->true_ub - (PDATA)->lb; \
|
||||
__lower_bound += (PDATA)->true_lb; \
|
||||
__upper_bound = (INITPTR) + (PDATA)->true_ub + \
|
||||
((PDATA)->ub - (PDATA)->lb) * ((COUNT) - 1); \
|
||||
if( ((ACTPTR) < __lower_bound) || ((ACTPTR) >= __upper_bound) ) { \
|
||||
ompi_ddt_safeguard_pointer_debug_breakpoint( (ACTPTR), (LENGTH), (INITPTR), (PDATA), (COUNT) ); \
|
||||
opal_output( 0, "%s:%d\n\tPointer %p size %d is outside [%p,%p] for\n\tbase ptr %p count %d and data \n", \
|
||||
|
@ -33,570 +33,18 @@ extern int ompi_pack_debug;
|
||||
#include "ompi/datatype/datatype_prototypes.h"
|
||||
|
||||
/*
 * Select the exported name of every pack routine. When CHECKSUM is defined
 * each generic "_function" name maps onto its "_checksum" variant; otherwise
 * it maps onto the plain variant. Each alias is defined exactly once per
 * branch.
 */
#if defined(CHECKSUM)
#define ompi_pack_general_function ompi_pack_general_checksum
#define ompi_pack_homogeneous_with_memcpy_function ompi_pack_homogeneous_with_memcpy_checksum
#define ompi_pack_no_conversion_function ompi_pack_no_conversion_checksum
#define ompi_pack_homogeneous_contig_function ompi_pack_homogeneous_contig_checksum
#define ompi_pack_homogeneous_contig_with_gaps_function ompi_pack_homogeneous_contig_with_gaps_checksum
#define ompi_generic_simple_pack_function ompi_generic_simple_pack_checksum
#else
#define ompi_pack_general_function ompi_pack_general
#define ompi_pack_homogeneous_with_memcpy_function ompi_pack_homogeneous_with_memcpy
#define ompi_pack_no_conversion_function ompi_pack_no_conversion
#define ompi_pack_homogeneous_contig_function ompi_pack_homogeneous_contig
#define ompi_pack_homogeneous_contig_with_gaps_function ompi_pack_homogeneous_contig_with_gaps
#define ompi_generic_simple_pack_function ompi_generic_simple_pack
#endif  /* defined(CHECKSUM) */
|
||||
|
||||
|
||||
int32_t
|
||||
ompi_pack_general_function( ompi_convertor_t* pConvertor,
|
||||
struct iovec* iov, uint32_t* out_size,
|
||||
size_t* max_data,
|
||||
int32_t* freeAfter )
|
||||
{
|
||||
dt_stack_t* pStack; /* pointer to the position on the stack */
|
||||
uint32_t pos_desc; /* actual position in the description of the derived datatype */
|
||||
int count_desc; /* the number of items already done in the actual pos_desc */
|
||||
int type = DT_CHAR; /* type at current position */
|
||||
uint32_t advance; /* number of bytes that we should advance the buffer */
|
||||
long disp_desc = 0; /* compute displacement for truncated data */
|
||||
int bConverted = 0; /* number of bytes converted this time */
|
||||
const ompi_datatype_t *pData = pConvertor->pDesc;
|
||||
const ompi_convertor_master_t* master = pConvertor->master;
|
||||
dt_elem_desc_t* pElem;
|
||||
char* pOutput = pConvertor->pBaseBuf;
|
||||
char* pInput;
|
||||
int iCount, rc;
|
||||
uint32_t iov_count, total_bytes_converted = 0;
|
||||
|
||||
DUMP( "convertor_decode( %p, {%p, %d}, %d )\n", (void*)pConvertor,
|
||||
iov[0].iov_base, iov[0].iov_len, *out_size );
|
||||
|
||||
pElem = pData->desc.desc;
|
||||
|
||||
pStack = pConvertor->pStack + pConvertor->stack_pos;
|
||||
pos_desc = pStack->index;
|
||||
disp_desc = pStack->disp;
|
||||
count_desc = pStack->count;
|
||||
pStack--;
|
||||
pConvertor->stack_pos--;
|
||||
|
||||
DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "starting" );
|
||||
DUMP( "remember position on stack %d last_elem at %d\n", pConvertor->stack_pos, pos_desc );
|
||||
DUMP( "top stack info {index = %d, count = %d}\n",
|
||||
pStack->index, pStack->count );
|
||||
|
||||
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
|
||||
bConverted = 0;
|
||||
if( iov[iov_count].iov_base == NULL ) {
|
||||
size_t length = iov[iov_count].iov_len;
|
||||
if( length <= 0 )
|
||||
length = pConvertor->local_size - pConvertor->bConverted - bConverted;
|
||||
if( (*max_data) < length )
|
||||
length = *max_data;
|
||||
iov[iov_count].iov_base = pConvertor->memAlloc_fn( &length, pConvertor->memAlloc_userdata );
|
||||
iov[iov_count].iov_len = length;
|
||||
*freeAfter = (*freeAfter) | ( 1 << iov_count);
|
||||
}
|
||||
pInput = iov[iov_count].iov_base;
|
||||
iCount = iov[iov_count].iov_len;
|
||||
while( 1 ) {
|
||||
if( DT_END_LOOP == pElem[pos_desc].elem.common.type ) { /* end of the current loop */
|
||||
if( --(pStack->count) == 0 ) { /* end of loop */
|
||||
if( pConvertor->stack_pos == 0 )
|
||||
goto complete_loop; /* completed */
|
||||
pConvertor->stack_pos--;
|
||||
pStack--;
|
||||
pos_desc++;
|
||||
} else {
|
||||
pos_desc = pStack->index + 1;
|
||||
if( pStack->index == -1 ) {
|
||||
pStack->disp += (pData->ub - pData->lb);
|
||||
} else {
|
||||
assert( DT_LOOP == pElem[pStack->index].elem.common.type );
|
||||
pStack->disp += pElem[pStack->index].loop.extent;
|
||||
}
|
||||
}
|
||||
count_desc = pElem[pos_desc].elem.count;
|
||||
disp_desc = pElem[pos_desc].elem.disp;
|
||||
}
|
||||
if( DT_LOOP == pElem[pos_desc].elem.common.type ) {
|
||||
do {
|
||||
PUSH_STACK( pStack, pConvertor->stack_pos,
|
||||
pos_desc, DT_LOOP, pElem[pos_desc].loop.loops,
|
||||
pStack->disp, pos_desc + pElem[pos_desc].loop.items + 1);
|
||||
pos_desc++;
|
||||
} while( DT_LOOP == pElem[pos_desc].elem.common.type ); /* let's start another loop */
|
||||
DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loops" );
|
||||
/* update the current state */
|
||||
count_desc = pElem[pos_desc].elem.count;
|
||||
disp_desc = pElem[pos_desc].elem.disp;
|
||||
continue;
|
||||
}
|
||||
while( pElem[pos_desc].elem.common.flags & DT_FLAG_DATA ) {
|
||||
/* now here we have a basic datatype */
|
||||
type = pElem[pos_desc].elem.common.type;
|
||||
rc = master->pFunctions[type]( pConvertor, count_desc,
|
||||
pOutput + pStack->disp + disp_desc,
|
||||
iCount, pElem[pos_desc].elem.extent,
|
||||
pInput, iCount, BASIC_DDT_FROM_ELEM(pElem[pos_desc])->size, &advance );
|
||||
iCount -= advance; /* decrease the available space in the buffer */
|
||||
pInput += advance; /* increase the pointer to the buffer */
|
||||
bConverted += advance;
|
||||
if( rc != count_desc ) {
|
||||
/* not all data has been converted. Keep the state */
|
||||
count_desc -= rc;
|
||||
disp_desc += rc * pElem[pos_desc].elem.extent;
|
||||
if( iCount != 0 )
|
||||
printf( "pack there is still room in the input buffer %d bytes\n", iCount );
|
||||
goto complete_loop;
|
||||
}
|
||||
pos_desc++; /* advance to the next data */
|
||||
count_desc = pElem[pos_desc].elem.count;
|
||||
disp_desc = pElem[pos_desc].elem.disp;
|
||||
if( iCount == 0 ) goto complete_loop; /* break if there is no more data in the buffer */
|
||||
}
|
||||
}
|
||||
complete_loop:
|
||||
pConvertor->bConverted += bConverted; /* update the already converted bytes */
|
||||
iov[iov_count].iov_len = bConverted; /* update the length in the iovec */
|
||||
total_bytes_converted += bConverted;
|
||||
}
|
||||
*max_data = total_bytes_converted;
|
||||
/* out of the loop: we have complete the data conversion or no more space
|
||||
* in the buffer.
|
||||
*/
|
||||
if( pConvertor->local_size == pConvertor->bConverted ) {
|
||||
pConvertor->flags |= CONVERTOR_COMPLETED;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* I complete an element, next step I should go to the next one */
|
||||
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, type, count_desc,
|
||||
disp_desc, pos_desc );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* We suppose here that we work with an already optimized version of the data
|
||||
*/
|
||||
int32_t
|
||||
ompi_pack_homogeneous_with_memcpy_function( ompi_convertor_t* pConv,
|
||||
struct iovec* iov,
|
||||
uint32_t* out_size,
|
||||
size_t* max_data,
|
||||
int* freeAfter )
|
||||
{
|
||||
dt_stack_t* pStack; /* pointer to the position on the stack */
|
||||
uint32_t pos_desc; /* actual position in the description of the derived datatype */
|
||||
int i; /* index for basic elements with extent */
|
||||
int bConverted = 0; /* number of bytes converted/moved this time */
|
||||
long lastDisp = 0, last_count = 0;
|
||||
uint32_t space = iov[0].iov_len, last_blength = 0;
|
||||
char* pDestBuf;
|
||||
const ompi_datatype_t* pData = pConv->pDesc;
|
||||
dt_elem_desc_t* pElems;
|
||||
|
||||
pDestBuf = iov[0].iov_base;
|
||||
|
||||
pElems = pConv->use_desc->desc;
|
||||
|
||||
pStack = pConv->pStack + pConv->stack_pos;
|
||||
pos_desc = pStack->index;
|
||||
lastDisp = pStack->disp;
|
||||
last_count = pStack->count;
|
||||
pStack--;
|
||||
pConv->stack_pos--;
|
||||
|
||||
while( 1 ) {
|
||||
if( DT_END_LOOP == pElems[pos_desc].elem.common.type ) { /* end of the current loop */
|
||||
if( --(pStack->count) == 0 ) { /* end of loop */
|
||||
if( pConv->stack_pos == 0 ) { /* finish everything */
|
||||
last_count = 0;
|
||||
pos_desc = -1;
|
||||
goto end_loop;
|
||||
}
|
||||
pStack--;
|
||||
pConv->stack_pos--;
|
||||
pos_desc++; /* go to the next element */
|
||||
} else {
|
||||
if( pStack->index == -1 ) {
|
||||
pStack->disp += (pData->ub - pData->lb);
|
||||
pos_desc = 0;
|
||||
} else {
|
||||
assert( DT_LOOP == pElems[pStack->index].elem.common.type );
|
||||
pStack->disp += pElems[pStack->index].loop.extent;
|
||||
pos_desc = pStack->index + 1;
|
||||
}
|
||||
}
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
last_blength = last_count;
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
continue;
|
||||
}
|
||||
while( DT_LOOP == pElems[pos_desc].elem.common.type ) {
|
||||
int stop_in_loop = 0;
|
||||
if( pElems[pos_desc].elem.common.flags & DT_FLAG_CONTIGUOUS ) {
|
||||
ddt_endloop_desc_t* end_loop = &(pElems[pos_desc + pElems[pos_desc].loop.items].end_loop);
|
||||
if( (end_loop->size * last_count) > space ) {
|
||||
stop_in_loop = last_count;
|
||||
last_count = space / end_loop->size;
|
||||
}
|
||||
for( i = 0; i < last_count; i++ ) {
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, end_loop->size,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
MEMCPY_CSUM( pDestBuf, pConv->pBaseBuf + lastDisp, end_loop->size, pConv );
|
||||
pDestBuf += end_loop->size; /* size of the contiguous data */
|
||||
lastDisp += pElems[pos_desc].loop.extent;
|
||||
}
|
||||
space -= (end_loop->size * last_count);
|
||||
bConverted += (end_loop->size * last_count);
|
||||
if( stop_in_loop == 0 ) {
|
||||
pos_desc += pElems[pos_desc].loop.items + 1;
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
continue;
|
||||
}
|
||||
/* mark some of the iterations as completed */
|
||||
last_count = stop_in_loop - last_count;
|
||||
last_blength = 0;
|
||||
/* Save the stack with the correct last_count value. */
|
||||
}
|
||||
PUSH_STACK( pStack, pConv->stack_pos, pos_desc, DT_LOOP, last_count,
|
||||
pStack->disp, pos_desc + pElems[pos_desc].loop.items );
|
||||
pos_desc++;
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
}
|
||||
/* now here we have a basic datatype */
|
||||
while( pElems[pos_desc].elem.common.flags & DT_FLAG_DATA ) {
|
||||
/* do we have enough space in the buffer ? */
|
||||
last_blength = last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size;
|
||||
if( space < last_blength ) {
|
||||
last_blength = last_count;
|
||||
last_count = space / BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size;
|
||||
space -= (last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size);
|
||||
last_blength -= last_count;
|
||||
goto end_loop; /* or break whatever but go out of this while */
|
||||
}
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, last_count,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
MEMCPY_CSUM( pDestBuf, pConv->pBaseBuf + lastDisp, last_count, pConv );
|
||||
bConverted += last_blength;
|
||||
space -= last_blength;
|
||||
pDestBuf += last_blength;
|
||||
pos_desc++; /* advance to the next data */
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
last_count = pElems[pos_desc].elem.count;
|
||||
}
|
||||
}
|
||||
end_loop:
|
||||
if( last_count != 0 ) { /* save the internal state */
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, last_count,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
MEMCPY_CSUM( pDestBuf, pConv->pBaseBuf + lastDisp, last_count, pConv );
|
||||
bConverted += last_count;
|
||||
lastDisp += last_count;
|
||||
}
|
||||
pConv->bConverted += bConverted; /* update the byte converted field in the convertor */
|
||||
iov[0].iov_len = bConverted; /* update the length in the iovec */
|
||||
*max_data = bConverted;
|
||||
if( pConv->bConverted == pConv->local_size ) {
|
||||
pConv->flags |= CONVERTOR_COMPLETED;
|
||||
return 1;
|
||||
}
|
||||
/* update the current stack position */
|
||||
PUSH_STACK( pStack, pConv->stack_pos, pos_desc, last_blength, pElems[pos_desc].elem.common.type,
|
||||
lastDisp, pos_desc );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define IOVEC_MEM_LIMIT 8192
|
||||
|
||||
/* The basic idea is to pack or return iovec depending on the datatype shape. If the data
|
||||
* is scattered in memory using small chuncks then we have to allocate some space (unless the upper
|
||||
* level provide some) and pack the data inside. If the chunks of data are large enough
|
||||
* then is useless to allocate additional memory and do the memcpy operation. We can simply
|
||||
* return the pointer to the contiguous piece of memory to the upper level.
|
||||
*/
|
||||
int32_t
|
||||
ompi_pack_no_conversion_function( ompi_convertor_t* pConv,
|
||||
struct iovec* iov,
|
||||
uint32_t *out_size,
|
||||
size_t* max_data,
|
||||
int* freeAfter )
|
||||
{
|
||||
dt_stack_t* pStack; /* pointer to the position on the stack */
|
||||
int pos_desc; /* actual position in the description of the derived datatype */
|
||||
int i; /* index for basic elements with extent */
|
||||
uint32_t iov_pos = 0; /* index in the iovec where we put data inside */
|
||||
int bConverted = 0; /* number of bytes converted/moved this time */
|
||||
uint32_t space_on_iovec; /* amount of free space on the current iovec */
|
||||
long lastDisp = 0;
|
||||
uint32_t space = *max_data, last_blength = 0, saveLength;
|
||||
char *destination, *source;
|
||||
const ompi_datatype_t* pData = pConv->pDesc;
|
||||
ddt_elem_desc_t pack_elem;
|
||||
dt_elem_desc_t* pElems;
|
||||
|
||||
pElems = pConv->use_desc->desc;
|
||||
|
||||
pStack = pConv->pStack + pConv->stack_pos;
|
||||
destination = iov[0].iov_base;
|
||||
source = (char*)pConv->pBaseBuf + pStack->disp;
|
||||
|
||||
/* retrieve the context of the last call */
|
||||
pos_desc = pStack->index;
|
||||
pack_elem.count = pStack->count;
|
||||
pack_elem.common.type = pElems[pos_desc].elem.common.type;
|
||||
last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size;
|
||||
lastDisp = pStack->disp;
|
||||
DO_DEBUG( opal_output( 0, "pack_no_conversion stack_pos %d index %d count %d last_blength %ld lastDisp %ld bConverted %d\n",
|
||||
pConv->stack_pos, pStack->index, pStack->count, last_blength, lastDisp,
|
||||
pConv->bConverted ); );
|
||||
saveLength = 0;
|
||||
pStack--;
|
||||
pConv->stack_pos--;
|
||||
|
||||
*freeAfter = (*freeAfter) & ~((1 << (*out_size)) - 1);
|
||||
space_on_iovec = iov[0].iov_len;
|
||||
|
||||
while( pos_desc >= 0 ) {
|
||||
if( DT_END_LOOP == pElems[pos_desc].elem.common.type ) { /* end of the current loop */
|
||||
if( --(pStack->count) == 0 ) { /* end of loop */
|
||||
if( pConv->stack_pos == 0 ) { /* finish everything */
|
||||
if( saveLength != 0 ) {
|
||||
/* there is still a chunk of memory to be handled, but here we dont allocate more
|
||||
* memory. We just copy what we can in the right place and update the values to be
|
||||
* saved on the next round.
|
||||
*/
|
||||
if( iov_pos < (*out_size) ) { /* still some place in the iovec */
|
||||
if( iov[iov_pos].iov_base == NULL ) {
|
||||
/* prepare a new iovec */
|
||||
iov[iov_pos].iov_base = source;
|
||||
iov[iov_pos].iov_len = saveLength;
|
||||
bConverted += saveLength;
|
||||
saveLength = 0;
|
||||
iov_pos++;
|
||||
space_on_iovec = 0;
|
||||
COMPUTE_CSUM( iov[iov_pos].iov_base, iov[iov_pos].iov_len, pConv );
|
||||
/* let's go out of here */
|
||||
} else {
|
||||
uint32_t copy_length = saveLength;
|
||||
if( space_on_iovec < saveLength ) {
|
||||
copy_length = space_on_iovec;
|
||||
}
|
||||
OMPI_DDT_SAFEGUARD_POINTER( source, copy_length,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
DO_DEBUG( opal_output( 0, "1. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source,
|
||||
copy_length, bConverted, space_on_iovec, pConv->bConverted ); );
|
||||
MEMCPY_CSUM( destination, source, copy_length, pConv );
|
||||
source += copy_length;
|
||||
destination += copy_length;
|
||||
bConverted += copy_length;
|
||||
space_on_iovec -= copy_length;
|
||||
saveLength -= copy_length;
|
||||
}
|
||||
}
|
||||
}
|
||||
iov[iov_pos].iov_len -= space_on_iovec;
|
||||
pack_elem.count = 0;
|
||||
pos_desc = -1;
|
||||
last_blength = 0;
|
||||
goto end_loop;
|
||||
}
|
||||
pConv->stack_pos--;
|
||||
pStack--;
|
||||
} else {
|
||||
pos_desc = pStack->index; /* DT_LOOP index */
|
||||
if( pos_desc == -1 ) {
|
||||
pStack->disp += (pData->ub - pData->lb);
|
||||
} else {
|
||||
assert( DT_LOOP == pElems[pos_desc].elem.common.type );
|
||||
pStack->disp += pElems[pos_desc].loop.extent;
|
||||
}
|
||||
}
|
||||
pos_desc++; /* go to the next element */
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
pack_elem.count = pElems[pos_desc].elem.count;
|
||||
last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size;
|
||||
continue; /* next loop */
|
||||
}
|
||||
while( DT_LOOP == pElems[pos_desc].elem.common.type ) {
|
||||
int stop_in_loop = 0;
|
||||
|
||||
/* If the loop container is contiguous then we can do some
|
||||
* optimizations.
|
||||
*/
|
||||
if( pElems[pos_desc].loop.common.flags & DT_FLAG_CONTIGUOUS ) {
|
||||
/* point to the end of loop element */
|
||||
ddt_endloop_desc_t* end_loop = &(pElems[pos_desc + pElems[pos_desc].loop.items].end_loop);
|
||||
if( iov[iov_pos].iov_base == NULL ) {
|
||||
size_t length = iov[iov_pos].iov_len;
|
||||
iov[iov_pos].iov_base = pConv->memAlloc_fn( &length, pConv->memAlloc_userdata );
|
||||
iov[iov_pos].iov_len = length;
|
||||
space_on_iovec = iov[iov_pos].iov_len;
|
||||
destination = iov[iov_pos].iov_base;
|
||||
(*freeAfter) |= (1 << iov_pos);
|
||||
}
|
||||
/* compute the maximum amount of data to be packed */
|
||||
if( (end_loop->size * pack_elem.count) > space_on_iovec ) {
|
||||
stop_in_loop = pack_elem.count;
|
||||
pack_elem.count = space_on_iovec / end_loop->size;
|
||||
}
|
||||
/* Now let's do it */
|
||||
for( i = 0; i < (int)pack_elem.count; i++ ) {
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, end_loop->size,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
DO_DEBUG (opal_output( 0, "2. memcpy( %p, %p, %ld )\n", destination, pConv->pBaseBuf + lastDisp,
|
||||
end_loop->size ); );
|
||||
MEMCPY_CSUM( destination, pConv->pBaseBuf + lastDisp, end_loop->size, pConv );
|
||||
lastDisp += pElems[pos_desc].loop.extent;
|
||||
destination += end_loop->size;
|
||||
}
|
||||
DO_DEBUG( opal_output( 0, "\t\tbConverted %d space %d pConv->bConverted %d\n",
|
||||
bConverted, space_on_iovec, pConv->bConverted ); );
|
||||
i = end_loop->size * pack_elem.count; /* temporary value */
|
||||
space_on_iovec -= i;
|
||||
space -= i;
|
||||
bConverted += i;
|
||||
if( stop_in_loop == 0 ) { /* did I stop before the end */
|
||||
/* the pElems point to the LOOP struct in the begining */
|
||||
pos_desc += pElems[pos_desc].loop.items + 1;
|
||||
pack_elem.count = pElems[pos_desc].elem.count;
|
||||
last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size;
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
continue;
|
||||
}
|
||||
/* mark some of the iterations as completed */
|
||||
pack_elem.count = stop_in_loop - pack_elem.count;
|
||||
last_blength = 0;
|
||||
/* Save the stack with the correct count value. */
|
||||
}
|
||||
PUSH_STACK( pStack, pConv->stack_pos, pos_desc, DT_LOOP, pack_elem.count,
|
||||
pStack->disp, pos_desc + pElems[pos_desc].loop.items );
|
||||
pos_desc++;
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
pack_elem.count = pElems[pos_desc].elem.count;
|
||||
last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size;
|
||||
}
|
||||
/* now here we have a basic datatype */
|
||||
while( pElems[pos_desc].elem.common.flags & DT_FLAG_DATA ) {
|
||||
/* first let's see if it's contiguous with the previous chunk of memory and
|
||||
* we still have enough room in the buffer...
|
||||
*/
|
||||
if( ((source + saveLength) == (pConv->pBaseBuf + lastDisp))
|
||||
&& ((saveLength + last_blength) <= space_on_iovec)
|
||||
&& (pElems[pos_desc].elem.extent == (long)BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size) ) {
|
||||
/* ok still contiguous and we still have some space on the buffer */
|
||||
saveLength += last_blength;
|
||||
/* nothing else to do, we act the next time */
|
||||
} else {
|
||||
/* Now we have 2 piece of non contiguous memory. One start at source
|
||||
* with a length of saveLength, the other start at
|
||||
* pConv->pBaseBuf + lastDisp with a length of last_blength bytes.
|
||||
* First we have to pack the old buffer and then we should decide
|
||||
* what we do with the new one.
|
||||
*/
|
||||
do {
|
||||
if( iov[iov_pos].iov_base == NULL ) {
|
||||
size_t length;
|
||||
|
||||
if( saveLength > IOVEC_MEM_LIMIT ) {
|
||||
/* If the user didn't provide any memory, then we are free
|
||||
* to handle this case as we want.
|
||||
*/
|
||||
iov[iov_pos].iov_base = source;
|
||||
iov[iov_pos].iov_len = saveLength;
|
||||
source = pConv->pBaseBuf + lastDisp;
|
||||
/* update the pack counters values */
|
||||
bConverted += saveLength;
|
||||
space -= saveLength;
|
||||
saveLength = last_blength;
|
||||
last_blength = 0;
|
||||
if( ++iov_pos == (*out_size) ) goto end_loop;
|
||||
destination = iov[iov_pos].iov_base;
|
||||
space_on_iovec = iov[iov_pos].iov_len;
|
||||
break;
|
||||
}
|
||||
length = iov[iov_pos].iov_len;
|
||||
/* Let's allocate some. */
|
||||
iov[iov_pos].iov_base = pConv->memAlloc_fn( &length, pConv->memAlloc_userdata );
|
||||
iov[iov_pos].iov_len = length;
|
||||
(*freeAfter) |= (1 << iov_pos);
|
||||
destination = iov[iov_pos].iov_base;
|
||||
space_on_iovec = iov[iov_pos].iov_len;
|
||||
}
|
||||
/* In all the others cases we simply copy as much data as possible */
|
||||
if( space_on_iovec > saveLength ) {
|
||||
OMPI_DDT_SAFEGUARD_POINTER( source, saveLength,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
DO_DEBUG( opal_output( 0, "3. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source,
|
||||
saveLength, bConverted, space_on_iovec, pConv->bConverted ); );
|
||||
MEMCPY_CSUM( destination, source, saveLength, pConv );
|
||||
destination += saveLength;
|
||||
/* update the pack counters values */
|
||||
bConverted += saveLength;
|
||||
space -= saveLength;
|
||||
space_on_iovec -= saveLength;
|
||||
source = pConv->pBaseBuf + lastDisp;
|
||||
saveLength = last_blength;
|
||||
last_blength = 0;
|
||||
break;
|
||||
}
|
||||
OMPI_DDT_SAFEGUARD_POINTER( source, space_on_iovec,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
DO_DEBUG( opal_output( 0, "4. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source,
|
||||
space_on_iovec, bConverted, space_on_iovec, pConv->bConverted ); );
|
||||
MEMCPY_CSUM( destination, source, space_on_iovec, pConv );
|
||||
/* let's prepare for the next round. As I keep trace of the amount that I still
|
||||
* have to pack, the next time when I came here, I'll try to append something.
|
||||
* If I already fill-up the amount of data required by the upper level, I will
|
||||
* simply save all informations in the stack, if not I'll take care of allocating
|
||||
* new memory and packing the data inside.
|
||||
*/
|
||||
source += space_on_iovec;
|
||||
saveLength -= space_on_iovec;
|
||||
/* update the pack counters values */
|
||||
bConverted += space_on_iovec;
|
||||
space -= space_on_iovec;
|
||||
lastDisp += space_on_iovec;
|
||||
/* check for the next step */
|
||||
if( ++iov_pos == (*out_size) ) { /* are there more iovecs to fill ? */
|
||||
if( saveLength == 0 ) {
|
||||
lastDisp -= space_on_iovec;
|
||||
saveLength = last_blength;
|
||||
last_blength = 0;
|
||||
}
|
||||
goto end_loop;
|
||||
}
|
||||
destination = iov[iov_pos].iov_base;
|
||||
space_on_iovec = iov[iov_pos].iov_len;
|
||||
} while(1); /* continue forever */
|
||||
}
|
||||
|
||||
if( saveLength > space ) /* this will be the last element copied this time */
|
||||
continue;
|
||||
pos_desc++; /* advance to the next data */
|
||||
lastDisp = pStack->disp + pElems[pos_desc].elem.disp;
|
||||
pack_elem.count = pElems[pos_desc].elem.count;
|
||||
last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size;
|
||||
}
|
||||
}
|
||||
end_loop:
|
||||
assert( last_blength == 0 );
|
||||
pConv->bConverted += bConverted; /* update the byte converted field in the convertor */
|
||||
*max_data = bConverted; /* update the length in the iovec */
|
||||
if( ((*out_size) == iov_pos) || (iov[iov_pos].iov_base == NULL) ) *out_size = iov_pos;
|
||||
else *out_size = iov_pos + 1;
|
||||
if( pConv->bConverted == pConv->local_size ) {
|
||||
pConv->flags |= CONVERTOR_COMPLETED;
|
||||
return 1;
|
||||
}
|
||||
PUSH_STACK( pStack, pConv->stack_pos, pos_desc, pElems[pos_desc].elem.common.type,
|
||||
saveLength, lastDisp, pos_desc );
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* the contig versions does not use the stack. They can easily retrieve
|
||||
* the status with just the informations from pConvertor->bConverted.
|
||||
@ -612,9 +60,11 @@ ompi_pack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
char *source_base = NULL;
|
||||
size_t length = pConv->local_size - pConv->bConverted;
|
||||
uint32_t iov_count, initial_amount = pConv->bConverted;
|
||||
ddt_endloop_desc_t* _end_loop = &(pConv->use_desc->desc[pConv->use_desc->used].end_loop);
|
||||
long initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
|
||||
|
||||
*freeAfter = 0;
|
||||
source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp);
|
||||
|
||||
/* There are some optimizations that can be done if the upper level
|
||||
* does not provide a buffer.
|
||||
*/
|
||||
@ -622,8 +72,6 @@ ompi_pack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
if( 0 == length ) break;
|
||||
if( (size_t)iov[iov_count].iov_len > length )
|
||||
iov[iov_count].iov_len = length;
|
||||
source_base = (pConv->pBaseBuf + _end_loop->first_elem_disp
|
||||
+ pStack[0].disp + pStack[1].disp);
|
||||
if( iov[iov_count].iov_base == NULL ) {
|
||||
iov[iov_count].iov_base = source_base;
|
||||
COMPUTE_CSUM( iov[iov_count].iov_base, iov[iov_count].iov_len, pConv );
|
||||
@ -636,6 +84,7 @@ ompi_pack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
length -= iov[iov_count].iov_len;
|
||||
pConv->bConverted += iov[iov_count].iov_len;
|
||||
pStack[0].disp += iov[iov_count].iov_len;
|
||||
source_base += iov[iov_count].iov_len;
|
||||
}
|
||||
|
||||
/* update the return value */
|
||||
@ -661,6 +110,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv,
|
||||
long extent;
|
||||
uint32_t max_allowed, i, index;
|
||||
uint32_t iov_count, total_bytes_converted = 0;
|
||||
long initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
|
||||
|
||||
extent = pData->ub - pData->lb;
|
||||
assert( (pData->flags & DT_FLAG_CONTIGUOUS) && ((long)pData->size != extent) );
|
||||
@ -676,7 +126,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv,
|
||||
/* There are some optimizations that can be done if the upper level
|
||||
* does not provide a buffer.
|
||||
*/
|
||||
user_memory = pConv->pBaseBuf + pData->true_lb + pStack[0].disp + pStack[1].disp;
|
||||
user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp;
|
||||
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
|
||||
if( 0 == max_allowed ) break; /* we're done this time */
|
||||
if( iov[iov_count].iov_base == NULL ) {
|
||||
@ -687,12 +137,13 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv,
|
||||
if( (uint32_t)pStack->count < ((*out_size) - iov_count) ) {
|
||||
pStack[1].count = pData->size - (pConv->bConverted % pData->size);
|
||||
for( index = iov_count; i < pConv->count; i++, index++ ) {
|
||||
iov[index].iov_base = user_memory + pStack[0].disp + pStack[1].disp;
|
||||
iov[index].iov_base = user_memory;
|
||||
iov[index].iov_len = pStack[1].count;
|
||||
pStack[0].disp += extent;
|
||||
total_bytes_converted += pStack[1].count;
|
||||
pStack[1].disp = 0; /* reset it for the next round */
|
||||
pStack[1].count = pData->size;
|
||||
user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp;
|
||||
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
|
||||
}
|
||||
*out_size = iov_count + index;
|
||||
@ -760,7 +211,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv,
|
||||
i++; /* just to compute the correct source pointer */
|
||||
total_bytes_converted += done;
|
||||
}
|
||||
user_memory = pConv->pBaseBuf + pData->true_lb + i * extent;
|
||||
user_memory = pConv->pBaseBuf + initial_displ + i * extent;
|
||||
counter = max_allowed / pData->size;
|
||||
if( counter > pConv->count ) counter = pConv->count;
|
||||
for( i = 0; i < counter; i++ ) {
|
||||
@ -777,7 +228,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv,
|
||||
* the pStack[0].disp field. BEWARE here we remove the pStack[1].disp as
|
||||
* it's supposed to be useless from now.
|
||||
*/
|
||||
user_memory = pConv->pBaseBuf + pData->true_lb + pStack[0].disp;
|
||||
user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp;
|
||||
}
|
||||
*max_data = total_bytes_converted;
|
||||
pConv->bConverted += total_bytes_converted;
|
||||
|
@ -15,30 +15,6 @@
|
||||
|
||||
#include "ompi_config.h"
|
||||
|
||||
OMPI_DECLSPEC int32_t
|
||||
ompi_pack_general( ompi_convertor_t* pConvertor,
|
||||
struct iovec* iov, uint32_t* out_size,
|
||||
size_t* max_data, int32_t* freeAfter );
|
||||
OMPI_DECLSPEC int32_t
|
||||
ompi_pack_general_checksum( ompi_convertor_t* pConvertor,
|
||||
struct iovec* iov, uint32_t* out_size,
|
||||
size_t* max_data, int32_t* freeAfter );
|
||||
OMPI_DECLSPEC int32_t
|
||||
ompi_pack_homogeneous_with_memcpy( ompi_convertor_t* pConv,
|
||||
struct iovec* iov, uint32_t* out_size,
|
||||
size_t* max_data, int32_t* freeAfter );
|
||||
OMPI_DECLSPEC int32_t
|
||||
ompi_pack_homogeneous_with_memcpy_checksum( ompi_convertor_t* pConv,
|
||||
struct iovec* iov, uint32_t* out_size,
|
||||
size_t* max_data, int32_t* freeAfter );
|
||||
int32_t
|
||||
ompi_pack_no_conversion( ompi_convertor_t* pConv,
|
||||
struct iovec* iov, uint32_t *out_size,
|
||||
size_t* max_data, int32_t* freeAfter );
|
||||
int32_t
|
||||
ompi_pack_no_conversion_checksum( ompi_convertor_t* pConv,
|
||||
struct iovec* iov, uint32_t *out_size,
|
||||
size_t* max_data, int32_t* freeAfter );
|
||||
OMPI_DECLSPEC int32_t
|
||||
ompi_pack_homogeneous_contig( ompi_convertor_t* pConv,
|
||||
struct iovec* iov, uint32_t* out_size,
|
||||
|
@ -70,7 +70,8 @@ ompi_unpack_general_function( ompi_convertor_t* pConvertor,
|
||||
int bConverted = 0; /* number of bytes converted this time */
|
||||
const ompi_convertor_master_t* master = pConvertor->master;
|
||||
dt_elem_desc_t* pElems;
|
||||
int oCount = (pConvertor->pDesc->ub - pConvertor->pDesc->lb) * pConvertor->count;
|
||||
long extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb;
|
||||
int oCount = extent * pConvertor->count;
|
||||
char* pInput;
|
||||
int iCount, rc;
|
||||
uint32_t iov_count, total_bytes_converted = 0;
|
||||
@ -104,7 +105,7 @@ ompi_unpack_general_function( ompi_convertor_t* pConvertor,
|
||||
}
|
||||
|
||||
if( pStack->index == -1 ) {
|
||||
pStack->disp += (pConvertor->pDesc->ub - pConvertor->pDesc->lb);
|
||||
pStack->disp += extent;
|
||||
} else {
|
||||
assert( DT_LOOP == pElems[pStack->index].elem.common.type );
|
||||
pStack->disp += pElems[pStack->index].loop.extent;
|
||||
@ -171,6 +172,15 @@ ompi_unpack_general_function( ompi_convertor_t* pConvertor,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function will be used to unpack all datatypes that have the contiguous flag set.
|
||||
* Several types of datatypes match this criterion, not only the contiguous one, but
|
||||
* the ones that have gaps in the beginning and/or at the end but where the data to
|
||||
* be unpacked is contiguous. However, this function only works for homogeneous cases
|
||||
* and the datatypes that are contiguous and where the extent is equal to the size are
|
||||
* taken into account directly in the ompi_convertor_unpack function (in convertor.c) for
|
||||
* the homogeneous case.
|
||||
*/
|
||||
int32_t
|
||||
ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
struct iovec* iov,
|
||||
@ -184,7 +194,7 @@ ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
long extent = pData->ub - pData->lb;
|
||||
uint32_t bConverted, length, remaining, i;
|
||||
dt_stack_t* stack = &(pConv->pStack[1]);
|
||||
ddt_endloop_desc_t* _end_loop = &(pConv->use_desc->desc[pConv->use_desc->used].end_loop);
|
||||
long initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
|
||||
|
||||
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
|
||||
packed_buffer = (char*)iov[iov_count].iov_base;
|
||||
@ -192,7 +202,7 @@ ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
if( remaining > (uint32_t)iov[iov_count].iov_len )
|
||||
remaining = iov[iov_count].iov_len;
|
||||
bConverted = remaining; /* how much will get unpacked this time */
|
||||
user_memory = pConv->pBaseBuf + _end_loop->first_elem_disp;
|
||||
user_memory = pConv->pBaseBuf + initial_displ;
|
||||
|
||||
/*opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %d\n",
|
||||
user_memory, packed_buffer, remaining );*/
|
||||
@ -254,6 +264,18 @@ ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function handles partial types. Depending on the send operation it might happen
|
||||
* that we receive only a partial type (always predefined type). In fact the outcome is
|
||||
* that the unpack has to be done in 2 steps. As there is no way to know if the other
|
||||
* part of the datatype is already received, we need to use a trick to handle this special
|
||||
* case. The trick is to fill the missing part with some well known value, unpack the data
|
||||
* as if it was completely received, and then move into the user memory only the bytes
|
||||
* that don't match the well known value. This approach works as long as there is no need
|
||||
* for more than structural changes. It will not work for cases where we will have to
|
||||
* change the content of the data (as in all conversions that require changing the size
|
||||
* of the exponent or mantissa).
|
||||
*/
|
||||
static inline uint32_t
|
||||
ompi_unpack_partial_datatype( ompi_convertor_t* pConvertor, dt_elem_desc_t* pElem,
|
||||
char* partial_data,
|
||||
|
@ -166,6 +166,7 @@ int32_t ompi_ddt_add( ompi_datatype_t* pdtBase, const ompi_datatype_t* pdtAdd,
|
||||
*/
|
||||
pdtBase->lb = lb;
|
||||
pdtBase->ub = ub;
|
||||
|
||||
if( 0 == pdtBase->nbElems ) old_true_ub = disp;
|
||||
else old_true_ub = pdtBase->true_ub;
|
||||
pdtBase->true_lb = LMIN( true_lb, pdtBase->true_lb );
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user