Move the main pack/unpack functions into the datatype.h file and make them static inline.
Move the logic for handling several iovecs directly into each specialized pack/unpack function. Disable the bounds checking. Improve the packing of contiguous messages. This commit was SVN r3504.
Этот коммит содержится в:
родитель
818b06803f
Коммит
52c54906b9
@ -190,7 +190,57 @@ OBJ_CLASS_DECLARATION( ompi_convertor_t );
|
||||
|
||||
/* Dispatch to the convertor's fAdvance callback, passing the convertor itself
 * followed by the four remaining arguments unchanged.
 * NOTE(review): the expansion already ends with ';', so a call site that adds
 * its own ';' yields an extra empty statement -- confirm before using this
 * macro as the sole body of an unbraced if/else.
 */
#define ompi_convertor_progress( PCONV, IOVEC, COUNT, PLENGTH, FDSET ) \
|
||||
(PCONV)->fAdvance( (PCONV), (IOVEC), (COUNT), (PLENGTH), (FDSET) );
|
||||
/*
|
||||
* Return 0 if everything went OK and if there is still room before the complete
|
||||
* conversion of the data (need additional call with others input buffers )
|
||||
* 1 if everything went fine and the data was completly converted
|
||||
* -1 something wrong occurs.
|
||||
*/
|
||||
static inline int ompi_convertor_pack( ompi_convertor_t* pConv,
|
||||
struct iovec* iov, uint32_t* out_size,
|
||||
uint32_t* max_data, int32_t* freeAfter )
|
||||
{
|
||||
/* protect against over packing data */
|
||||
if( pConv->bConverted == (pConv->pDesc->size * pConv->count) ) {
|
||||
iov[0].iov_len = 0;
|
||||
*out_size = 0;
|
||||
return 1; /* nothing to do */
|
||||
}
|
||||
|
||||
/* We dont allocate any memory. The packing function should allocate it
|
||||
* if it need. If it's possible to find iovec in the derived datatype
|
||||
* description then we dont have to allocate any memory.
|
||||
*/
|
||||
return pConv->fAdvance( pConv, iov, out_size, max_data, freeAfter );
|
||||
}
|
||||
|
||||
/**
 * Unpack the next chunk of data into the buffer described by a convertor.
 *
 * Three cases are handled:
 *  - every byte has already been converted: the call is a no-op that reports
 *    completion;
 *  - the convertor is flagged DT_FLAG_CONTIGUOUS and iov[0].iov_base is NULL:
 *    no data is copied, iov[0] is simply pointed at the not yet converted part
 *    of the convertor's base buffer and its length is clipped to the number
 *    of remaining bytes;
 *  - anything else is delegated to the convertor's fAdvance callback.
 *
 * @param pConv     convertor holding the datatype description, the element
 *                  count and the number of bytes converted so far
 * @param iov       iovec array; only iov[0] is touched by this function itself
 * @param out_size  forwarded to fAdvance; set to 0 when nothing is left
 * @param max_data  forwarded to fAdvance; set to 0 when nothing is left
 * @param freeAfter cleared on entry, then forwarded to fAdvance
 *
 * @return 1 once all the data has been converted, 0 on the contiguous path
 *         while data is still pending, otherwise whatever fAdvance returns.
 */
static inline int ompi_convertor_unpack( ompi_convertor_t* pConv,
                                         struct iovec* iov, uint32_t* out_size,
                                         uint32_t* max_data, int32_t* freeAfter )
{
    dt_desc_t *pData = pConv->pDesc;
    uint32_t length;

    /* Nothing has been allocated on behalf of the caller yet. Clearing the
     * flag here keeps it defined on the two paths that never reach fAdvance.
     */
    *freeAfter = 0;

    /* protect against over unpacking data */
    if( pConv->bConverted == (pData->size * pConv->count) ) {
        iov[0].iov_len = 0;
        *out_size = 0;  /* no iovec entry was consumed ... */
        *max_data = 0;  /* ... and no byte was converted */
        return 1;  /* nothing to do */
    }

    if( (pConv->flags & DT_FLAG_CONTIGUOUS) && (iov[0].iov_base == NULL) ) {
        /* Hand out a pointer inside the base buffer instead of copying.
         * NOTE(review): an iov_len of 0 yields an empty chunk here, and
         * neither *out_size nor *max_data is updated on this path -- confirm
         * that callers only rely on iov[0].iov_len.
         */
        length = pConv->count * pData->size - pConv->bConverted;
        iov[0].iov_base = pConv->pBaseBuf + pData->true_lb + pConv->bConverted;
        if( iov[0].iov_len < length )
            length = iov[0].iov_len;
        iov[0].iov_len = length;
        pConv->bConverted += length;
        return (pConv->bConverted == (pData->size * pConv->count));
    }
    return pConv->fAdvance( pConv, iov, out_size, max_data, freeAfter );
}
|
||||
|
||||
/* and finally the convertor functions */
|
||||
OMPI_DECLSPEC ompi_convertor_t* ompi_convertor_create( int remote_arch, int mode );
|
||||
@ -205,16 +255,6 @@ OMPI_DECLSPEC int ompi_convertor_init_for_recv( ompi_convertor_t* pConv, unsigne
|
||||
void* pUserBuf, int remote_starting_point,
|
||||
memalloc_fct_t allocfn );
|
||||
OMPI_DECLSPEC int ompi_convertor_need_buffers( ompi_convertor_t* pConvertor );
|
||||
OMPI_DECLSPEC int ompi_convertor_unpack( ompi_convertor_t* pConv,
|
||||
struct iovec* out,
|
||||
unsigned int* out_size,
|
||||
unsigned int* max_data,
|
||||
int* freeAfter );
|
||||
OMPI_DECLSPEC int ompi_convertor_pack( ompi_convertor_t* pConv,
|
||||
struct iovec* in,
|
||||
unsigned int* in_size,
|
||||
unsigned int* max_data,
|
||||
int* freeAfter );
|
||||
OMPI_DECLSPEC int ompi_convertor_get_packed_size( ompi_convertor_t* pConv, unsigned int* pSize );
|
||||
OMPI_DECLSPEC int ompi_convertor_get_unpacked_size( ompi_convertor_t* pConv, unsigned int* pSize );
|
||||
|
||||
@ -230,12 +270,6 @@ OMPI_DECLSPEC int ompi_ddt_set_args( dt_desc_t* pData,
|
||||
OMPI_DECLSPEC int ompi_ddt_sndrcv( void *sbuf, int scount, ompi_datatype_t* sdtype, void *rbuf,
|
||||
int rcount, ompi_datatype_t* rdtype, int tag, MPI_Comm comm);
|
||||
|
||||
/**
 * Allocate *pSize bytes with malloc().
 *
 * @param pSize pointer to the requested size in bytes; the value is only read
 *
 * @return the malloc()ed buffer, or NULL when pSize is NULL, when the
 *         requested size is 0, or when malloc() itself fails.
 */
static inline
void* allocate_memory_for_ddt( unsigned int* pSize )
{
    /* a missing size pointer is handled like an empty request */
    if( (NULL == pSize) || (0 == *pSize) ) return NULL;
    return malloc( *pSize );
}
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -190,8 +190,12 @@ do { \
|
||||
/*printf( "memcpy dest = %p src = %p length = %d\n", (void*)(DST), (void*)(SRC), (int)(BLENGTH) );*/ \
|
||||
memcpy( (DST), (SRC), (BLENGTH) ); }
|
||||
|
||||
/* Pointer sanity-check hook. When DO_INTENSIVE_DEBUGGING is defined the macro
 * forwards its five arguments to ompi_ddt_safeguard_pointer(); otherwise it
 * expands to nothing, so the arguments are not evaluated at all.
 */
#if defined(DO_INTENSIVE_DEBUGGING)
|
||||
#define OMPI_DDT_SAFEGUARD_POINTER( ACTPTR, LENGTH, INITPTR, PDATA, COUNT ) \
|
||||
ompi_ddt_safeguard_pointer( (ACTPTR), (LENGTH), (INITPTR), (PDATA), (COUNT) )
|
||||
#else
|
||||
#define OMPI_DDT_SAFEGUARD_POINTER( ACTPTR, LENGTH, INITPTR, PDATA, COUNT )
|
||||
#endif /* DO_INTENSIVE_DEBUGGING */
|
||||
|
||||
static inline void ompi_ddt_safeguard_pointer( void* actual_ptr, int length,
|
||||
void* initial_ptr,
|
||||
@ -268,35 +272,36 @@ int ompi_convertor_create_stack_with_pos_contig( ompi_convertor_t* pConvertor,
|
||||
ompi_datatype_t* pData = pConvertor->pDesc;
|
||||
dt_elem_desc_t* pElems;
|
||||
uint32_t count;
|
||||
int32_t index;
|
||||
long extent;
|
||||
|
||||
pStack = pConvertor->pStack;
|
||||
|
||||
pStack->count = pConvertor->count;
|
||||
pStack->index = -1;
|
||||
pStack[0].count = pConvertor->count;
|
||||
pStack[0].index = -1;
|
||||
if( pData->opt_desc.desc != NULL ) {
|
||||
pElems = pData->opt_desc.desc;
|
||||
pStack->end_loop = pData->opt_desc.used;
|
||||
pStack[0].end_loop = pData->opt_desc.used;
|
||||
} else {
|
||||
pElems = pData->desc.desc;
|
||||
pStack->end_loop = pData->desc.used;
|
||||
pStack[0].end_loop = pData->desc.used;
|
||||
}
|
||||
|
||||
/* Special case for contiguous datatypes */
|
||||
count = starting_point / pData->size;
|
||||
extent = pData->ub - pData->lb;
|
||||
|
||||
pStack->disp = count * extent;
|
||||
|
||||
pStack->count -= count;
|
||||
count = starting_point - count * pData->size; /* number of bytes after the loop */
|
||||
pStack[1].index = 0;
|
||||
pStack[1].count = pElems[count].count - count;
|
||||
pStack[1].end_loop = pStack->end_loop;
|
||||
|
||||
pStack[1].disp = pStack->disp /* the total displacement depending on the already done elements */
|
||||
+ pData->size - count; /* everything from the beginig of this loop */
|
||||
pStack[0].disp = count * extent;
|
||||
pStack[0].count -= count;
|
||||
|
||||
/* now compute the number of pending bytes */
|
||||
count = starting_point - count * pData->size;
|
||||
pStack[1].index = 0; /* useless */
|
||||
pStack[1].count = pData->size - count;
|
||||
pStack[1].end_loop = 0; /* useless */
|
||||
/* we save the current displacement starting from the beginning
|
||||
* of this data.
|
||||
*/
|
||||
pStack[1].disp = count;
|
||||
|
||||
pConvertor->bConverted = starting_point;
|
||||
pConvertor->stack_pos = 1;
|
||||
|
@ -574,18 +574,66 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv,
|
||||
return (pConv->bConverted == (pData->size * pConv->count));
|
||||
}
|
||||
|
||||
/* the Contig versions does not use the stack. They can easily retrieve
|
||||
/* the contig versions does not use the stack. They can easily retrieve
|
||||
* the status with just the informations from pConvertor->bConverted.
|
||||
*/
|
||||
static
|
||||
int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
|
||||
static int
|
||||
ompi_convertor_pack_no_conv_contig( ompi_convertor_t* pConv,
|
||||
struct iovec* iov,
|
||||
uint32_t* out_size,
|
||||
uint32_t* max_data,
|
||||
int* freeAfter )
|
||||
{
|
||||
dt_desc_t* pData = pConv->pDesc;
|
||||
dt_stack_t* pStack = pConv->pStack;
|
||||
char *pSrc;
|
||||
size_t length = pData->size * pConv->count;
|
||||
uint32_t iov_count, initial_amount = pConv->bConverted;
|
||||
|
||||
pSrc = pConv->pBaseBuf + pStack->disp; /* actual starting point for the conversion */
|
||||
|
||||
*freeAfter = 0;
|
||||
/* There are some optimizations that can be done if the upper level
|
||||
* does not provide a buffer.
|
||||
*/
|
||||
pSrc = pConv->pBaseBuf + pStack[0].disp + pStack[1].disp;
|
||||
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
|
||||
if( iov[iov_count].iov_base == NULL ) {
|
||||
iov[iov_count].iov_base = pSrc;
|
||||
if( (pConv->bConverted + iov[iov_count].iov_len) > length )
|
||||
iov[iov_count].iov_len = length - pConv->bConverted;
|
||||
} else {
|
||||
/* contiguous data just memcpy the smallest data in the user buffer */
|
||||
iov[iov_count].iov_len = IMIN( iov[iov_count].iov_len, length );
|
||||
OMPI_DDT_SAFEGUARD_POINTER( pSrc, iov[iov_count].iov_len,
|
||||
pConv->pBaseBuf, pData, pConv->count );
|
||||
MEMCPY( iov[iov_count].iov_base, pSrc, iov[iov_count].iov_len);
|
||||
}
|
||||
pConv->bConverted += iov[iov_count].iov_len;
|
||||
pStack[0].disp += iov[iov_count].iov_len;
|
||||
pSrc = pConv->pBaseBuf + pStack[0].disp;
|
||||
if( pConv->bConverted == length ) break;
|
||||
}
|
||||
/* the number of complete datatypes still to be copied */
|
||||
pStack[0].count = pConv->count - (pConv->bConverted / pData->size);
|
||||
/* the amount of data (in bytes) that still have to be done on the last data */
|
||||
pStack[1].count = pConv->bConverted - pData->size * pStack[0].count;
|
||||
pStack[1].disp = pData->size - pStack[1].count;
|
||||
/* update the return value */
|
||||
*max_data = pConv->bConverted - initial_amount;
|
||||
*out_size = iov_count;
|
||||
return (pConv->bConverted == length);
|
||||
}
|
||||
|
||||
static int
|
||||
ompi_convertor_pack_no_conv_contig_with_gaps( ompi_convertor_t* pConv,
|
||||
struct iovec* iov,
|
||||
uint32_t* out_size,
|
||||
uint32_t* max_data,
|
||||
int* freeAfter )
|
||||
{
|
||||
dt_desc_t* pData = pConv->pDesc;
|
||||
dt_stack_t* pStack = &(pConv->pStack[pConv->stack_pos]);
|
||||
dt_stack_t* pStack = pConv->pStack;
|
||||
char *pSrc, *pDest;
|
||||
size_t length = pData->size * pConv->count;
|
||||
long extent;
|
||||
@ -595,27 +643,31 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
|
||||
|
||||
i = pConv->bConverted / pData->size; /* how many we already pack */
|
||||
extent = pData->ub - pData->lb;
|
||||
pSrc = pConv->pBaseBuf + pStack->disp + pStack->count; /* actual starting point for the conversion */
|
||||
pSrc = pConv->pBaseBuf + pStack->disp; /* actual starting point for the conversion */
|
||||
|
||||
*freeAfter = 0;
|
||||
/* There are some optimizations that can be done if the upper level
|
||||
* does not provide a buffer.
|
||||
*/
|
||||
pSrc = pConv->pBaseBuf + pStack[0].disp + pStack[1].disp;
|
||||
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
|
||||
if( iov[iov_count].iov_base == NULL ) {
|
||||
/* special case for small data. We avoid allocating memory if we
|
||||
* can fill the iovec directly with the address of the remaining
|
||||
* data.
|
||||
*/
|
||||
if( (pConv->count - i) < ((*out_size) - iov_count) ) {
|
||||
if( (uint32_t)pStack->count < ((*out_size) - iov_count) ) {
|
||||
for( index = iov_count; i < pConv->count; i++, index++ ) {
|
||||
iov[index].iov_base = pSrc;
|
||||
iov[index].iov_len = pData->size;
|
||||
pSrc += extent;
|
||||
pConv->bConverted += pData->size;
|
||||
iov[index].iov_base = pSrc + pStack[0].disp + pStack[1].disp;
|
||||
iov[index].iov_len = pStack[1].count;
|
||||
pStack[0].disp += extent;
|
||||
total_bytes_converted += pStack[1].count;
|
||||
pStack[1].disp = 0; /* reset it for the next round */
|
||||
pStack[1].count = pData->size;
|
||||
}
|
||||
*out_size = iov_count + index;
|
||||
*max_data = total_bytes_converted + index * pData->size;
|
||||
pConv->bConverted += total_bytes_converted;
|
||||
*max_data = total_bytes_converted;
|
||||
return 1; /* we're done */
|
||||
}
|
||||
/* now special case for big contiguous data with gaps around */
|
||||
@ -648,7 +700,7 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
|
||||
|
||||
if( (long)pData->size == extent ) { /* that really contiguous */
|
||||
if( iov[iov_count].iov_base == NULL ) {
|
||||
iov[iov_count].iov_base = pSrc; /* + pConv->bConverted; */
|
||||
iov[iov_count].iov_base = pSrc;
|
||||
if( (pConv->bConverted + iov[iov_count].iov_len) > length )
|
||||
iov[iov_count].iov_len = length - pConv->bConverted;
|
||||
} else {
|
||||
@ -696,6 +748,11 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
|
||||
iov[iov_count].iov_len -= max_allowed;
|
||||
total_bytes_converted += iov[iov_count].iov_len;
|
||||
}
|
||||
/* Now update the pSrc pointer. At the end of each part we have to update
|
||||
* the pStack[0].disp field. BEWARE here we remove the pStack[1].disp as
|
||||
* it's supposed to be useless from now.
|
||||
*/
|
||||
pSrc = pConv->pBaseBuf + pStack[0].disp;
|
||||
}
|
||||
*max_data = total_bytes_converted;
|
||||
pConv->bConverted += total_bytes_converted;
|
||||
@ -703,75 +760,6 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
|
||||
return (pConv->bConverted == length);
|
||||
}
|
||||
|
||||
/* The pack routines should do 2 things:
|
||||
* - first if the provided iovec contains NULL pointers then they should provide
|
||||
* buffer space. If the data is contiguous then it should directly provide pointers
|
||||
* to the user space depending on the iov_len argument. If -1 then the whole buffer
|
||||
* can be supplied in one time, if not several steps need to be executed, it should
|
||||
* provide the correct pointer every time. But if the user provide a buffer, then
|
||||
* some parts of the data should be packed inside this buffer, but we still should
|
||||
* able to have pointers to the user buf on the subsequents calls.
|
||||
*
|
||||
* The iovec provided by the upper level can have several meanings:
|
||||
* - the iov_base field contain a address not NULL, the user have provided some memory.
|
||||
* Then the iov_len field should be not empty too, and we have to respect the high
|
||||
* level requirements.
|
||||
* - if iov_base of the first iovec is NULL then the iov_len provided in the first iovec
|
||||
* is the maximum amount of data that we will pack. If this field is set to zero,
|
||||
* then we compute this maximum using the convertor and the amount of data already
|
||||
* packed.
|
||||
*
|
||||
* Return 0 if everything went OK and if there is still room before the complete
|
||||
* conversion of the data (need additional call with others input buffers )
|
||||
* 1 if everything went fine and the data was completely converted
|
||||
* -1 something wrong occurs.
|
||||
*/
|
||||
/* Drive the convertor's packing routine over the supplied iovec array.
 *
 * Entries whose iov_len is 0 (they must then carry a NULL iov_base) get their
 * length set to the number of bytes that still have to be converted. The loop
 * stops as soon as the packing routine reports completion or the array has
 * been consumed. On return *out_size holds the number of iovec entries used.
 *
 * Returns 1 when the conversion is complete, otherwise the last value
 * reported by the packing routine.
 */
int ompi_convertor_pack( ompi_convertor_t* pConv,
                         struct iovec* iov,
                         uint32_t* out_size,
                         uint32_t* max_data,
                         int* freeAfter )
{
    dt_desc_t* pDesc = pConv->pDesc;
    uint32_t completed = 0;
    uint32_t pos = 0;

    /* no buffer has been allocated on behalf of the caller so far */
    *freeAfter = 0;

    /* TODO should use the remote size */
    if( (pDesc->size * pConv->count) == pConv->bConverted ) {
        /* the conversion is already complete, or there was nothing to convert */
        iov[0].iov_len = 0;
        *out_size = 0;
        return 1;
    }

    while( pos < (*out_size) ) {
        if( 0 == iov[pos].iov_len ) {
            /* a zero length asks for everything that is still pending */
            assert( NULL == iov[pos].iov_base );
            iov[pos].iov_len = pConv->count * pDesc->size - pConv->bConverted;
        }
#if defined(ONE_STEP)
        {
            int howMany = 1;

            if( NULL == iov[pos].iov_base ) {
                /* remember which entries were allocated here */
                iov[pos].iov_base = pConv->memAlloc_fn( &(iov[pos].iov_len) );
                (*freeAfter) |= (1 << pos);
            }
            completed = convertor_progress( pConv, &(iov[pos]), &howMany, max_data, freeAfter );
            pos++;
        }
#else
        /* No memory is allocated at this level: the packing function does it
         * when required, and may avoid it entirely when the iovec entries can
         * be derived from the datatype description.
         */
        completed = ompi_convertor_progress( pConv, &(iov[pos]), out_size, max_data, freeAfter );
        pos += (*out_size);
#endif  /* ONE_STEP */
        if( 1 == completed ) break;
    }

    *out_size = pos;
    return completed;
}
|
||||
|
||||
extern int ompi_ddt_local_sizes[DT_MAX_PREDEFINED];
|
||||
int ompi_convertor_init_for_send( ompi_convertor_t* pConv,
|
||||
uint32_t flags,
|
||||
@ -802,9 +790,13 @@ int ompi_convertor_init_for_send( ompi_convertor_t* pConv,
|
||||
pConv->converted = 0;
|
||||
pConv->bConverted = 0;
|
||||
pConv->memAlloc_fn = allocfn;
|
||||
pConv->fAdvance = ompi_convertor_pack_homogeneous_with_memcpy;
|
||||
if( dt->flags & DT_FLAG_CONTIGUOUS ) {
|
||||
pConv->flags |= DT_FLAG_CONTIGUOUS | CONVERTOR_HOMOGENEOUS;
|
||||
pConv->fAdvance = ompi_convertor_pack_no_conversion_contig;
|
||||
if( (pConv->pDesc->ub - pConv->pDesc->lb) == (long)pConv->pDesc->size )
|
||||
pConv->fAdvance = ompi_convertor_pack_no_conv_contig;
|
||||
else
|
||||
pConv->fAdvance = ompi_convertor_pack_no_conv_contig_with_gaps;
|
||||
return ompi_convertor_create_stack_with_pos_contig( pConv, starting_pos, ompi_ddt_local_sizes );
|
||||
}
|
||||
pConv->fAdvance = ompi_convertor_pack_general;
|
||||
|
@ -342,39 +342,6 @@ static int ompi_convertor_unpack_homogeneous_contig( ompi_convertor_t* pConv,
|
||||
return (pConv->bConverted == (pData->size * pConv->count));
|
||||
}
|
||||
|
||||
/* Unpack the next chunk of data for the given convertor.
 *
 * Returns 1 immediately when every byte has already been converted. For a
 * convertor flagged DT_FLAG_CONTIGUOUS whose first iovec has a NULL base, no
 * copy takes place: iov[0] is pointed at the not yet converted part of the
 * base buffer. An iov_len of 0 requests all the remaining bytes, any other
 * value is clipped to what remains. Everything else goes through
 * ompi_convertor_progress().
 */
int ompi_convertor_unpack( ompi_convertor_t* pConvertor,
                           struct iovec* iov,
                           uint32_t* out_size,
                           uint32_t* max_data,
                           int32_t* freeAfter )
{
    dt_desc_t *pDesc = pConvertor->pDesc;
    uint32_t remaining;

    *freeAfter = 0;

    if( (pDesc->size * pConvertor->count) == pConvertor->bConverted ) {
        iov[0].iov_len = 0;
        *max_data = 0;
        return 1;  /* nothing to do */
    }

    if( (pConvertor->flags & DT_FLAG_CONTIGUOUS) && (NULL == iov[0].iov_base) ) {
        remaining = pConvertor->count * pDesc->size - pConvertor->bConverted;
        iov[0].iov_base = pConvertor->pBaseBuf + pDesc->true_lb + pConvertor->bConverted;
        /* zero means "give me the whole buffer"; otherwise honour the limit */
        if( (0 != iov[0].iov_len) && (iov[0].iov_len < remaining) )
            remaining = iov[0].iov_len;
        iov[0].iov_len = remaining;
        pConvertor->bConverted += remaining;
        return (pConvertor->bConverted == (pDesc->size * pConvertor->count));
    }

    return ompi_convertor_progress( pConvertor, iov, out_size, max_data, freeAfter );
}
|
||||
|
||||
/* Return value:
|
||||
* 0 : nothing has been done
|
||||
* positive value: number of item converted.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user