1
1

Move the main pack/unpack functions into the datatype.h file and make them static inline.

Move the logic for handling several iovecs directly into each specialized pack/unpack function.
Disable the boundary checking.
Improve the packing of contiguous messages.

This commit was SVN r3504.
Этот коммит содержится в:
George Bosilca 2004-11-03 21:57:50 +00:00
родитель 818b06803f
Коммит 52c54906b9
4 изменённых файлов: 144 добавлений и 146 удалений

Просмотреть файл

@ -190,7 +190,57 @@ OBJ_CLASS_DECLARATION( ompi_convertor_t );
/* Advance a conversion by calling the convertor's own fAdvance callback,
 * passing the convertor itself followed by the remaining four arguments.
 * NOTE(review): the expansion already ends with a ';', so a call site that
 * adds its own ';' produces an extra empty statement -- keep this in mind
 * before using the macro in an unbraced if/else.
 */
#define ompi_convertor_progress( PCONV, IOVEC, COUNT, PLENGTH, FDSET ) \
(PCONV)->fAdvance( (PCONV), (IOVEC), (COUNT), (PLENGTH), (FDSET) );
/*
 * Pack the data described by the convertor into the supplied iovec array.
 *
 * pConv      convertor holding the datatype description, the element count,
 *            the number of bytes already converted and the fAdvance routine.
 * iov        array of iovec entries to be filled.
 * out_size   in: number of entries in iov; out: set by the fAdvance routine
 *            (0 when there was nothing left to pack).
 * max_data   out: set by the fAdvance routine (0 when there was nothing
 *            left to pack).
 * freeAfter  out: cleared on entry, then handed to the fAdvance routine.
 *
 * Return  0 if everything went OK and there is still data left to convert
 *           (additional calls with other buffers are needed),
 *         1 if everything went fine and the data was completely converted,
 *        -1 if something went wrong.
 */
static inline int ompi_convertor_pack( ompi_convertor_t* pConv,
                                       struct iovec* iov, uint32_t* out_size,
                                       uint32_t* max_data, int32_t* freeAfter )
{
    /* Nothing has been allocated on behalf of the caller yet. Clearing the
     * mask here guarantees a defined value on every return path, including
     * the early return below.
     */
    *freeAfter = 0;

    /* protect against over packing data */
    if( pConv->bConverted == (pConv->pDesc->size * pConv->count) ) {
        iov[0].iov_len = 0;
        *out_size = 0;
        *max_data = 0;  /* no byte was packed by this call */
        return 1;  /* nothing to do */
    }

    /* We don't allocate any memory here. The packing function should
     * allocate it if needed. If it's possible to find iovecs in the derived
     * datatype description then no memory has to be allocated at all.
     */
    return pConv->fAdvance( pConv, iov, out_size, max_data, freeAfter );
}
/*
 * Unpack data from the supplied iovec array into the user buffer described
 * by the convertor.
 *
 * pConv      convertor holding the datatype description, the element count,
 *            the base buffer and the number of bytes already converted.
 * iov        array of iovec entries holding (or receiving) the data.
 * out_size   in: number of entries in iov; set to 0 when there was nothing
 *            left to unpack, otherwise handled by the fAdvance routine.
 * max_data   set to 0 when there was nothing left to unpack, otherwise
 *            handled by the fAdvance routine.
 * freeAfter  out: cleared on entry, then handed to the fAdvance routine.
 *
 * Returns 1 once all pData->size * pConv->count bytes have been converted,
 * otherwise the value produced by the fast path below (0) or by fAdvance.
 */
static inline int ompi_convertor_unpack( ompi_convertor_t* pConv,
                                         struct iovec* iov, uint32_t* out_size,
                                         uint32_t* max_data, int32_t* freeAfter )
{
    dt_desc_t *pData = pConv->pDesc;
    uint32_t length;

    /* Nothing has been allocated on behalf of the caller yet; make sure the
     * mask is defined on every return path.
     */
    *freeAfter = 0;

    /* protect against over unpacking data */
    if( pConv->bConverted == (pData->size * pConv->count) ) {
        iov[0].iov_len = 0;
        *out_size = 0;  /* no iovec entry was consumed by this call */
        *max_data = 0;
        return 1;  /* nothing to do */
    }

    if( pConv->flags & DT_FLAG_CONTIGUOUS ) {
        if( iov[0].iov_base == NULL ) {
            /* Fast path: the caller supplied no buffer, so hand back a
             * pointer straight into the user buffer at the current position
             * and account for at most iov[0].iov_len bytes. The chunk is
             * reported through iov[0] only; *out_size and *max_data are
             * left untouched here.
             */
            length = pConv->count * pData->size - pConv->bConverted;
            iov[0].iov_base = pConv->pBaseBuf + pData->true_lb + pConv->bConverted;
            if( iov[0].iov_len < length )
                length = iov[0].iov_len;
            iov[0].iov_len = length;
            pConv->bConverted += length;
            return (pConv->bConverted == (pData->size * pConv->count));
        }
    }

    return pConv->fAdvance( pConv, iov, out_size, max_data, freeAfter );
}
/* and finally the convertor functions */
OMPI_DECLSPEC ompi_convertor_t* ompi_convertor_create( int remote_arch, int mode );
@ -205,16 +255,6 @@ OMPI_DECLSPEC int ompi_convertor_init_for_recv( ompi_convertor_t* pConv, unsigne
void* pUserBuf, int remote_starting_point,
memalloc_fct_t allocfn );
OMPI_DECLSPEC int ompi_convertor_need_buffers( ompi_convertor_t* pConvertor );
OMPI_DECLSPEC int ompi_convertor_unpack( ompi_convertor_t* pConv,
struct iovec* out,
unsigned int* out_size,
unsigned int* max_data,
int* freeAfter );
OMPI_DECLSPEC int ompi_convertor_pack( ompi_convertor_t* pConv,
struct iovec* in,
unsigned int* in_size,
unsigned int* max_data,
int* freeAfter );
OMPI_DECLSPEC int ompi_convertor_get_packed_size( ompi_convertor_t* pConv, unsigned int* pSize );
OMPI_DECLSPEC int ompi_convertor_get_unpacked_size( ompi_convertor_t* pConv, unsigned int* pSize );
@ -230,12 +270,6 @@ OMPI_DECLSPEC int ompi_ddt_set_args( dt_desc_t* pData,
OMPI_DECLSPEC int ompi_ddt_sndrcv( void *sbuf, int scount, ompi_datatype_t* sdtype, void *rbuf,
int rcount, ompi_datatype_t* rdtype, int tag, MPI_Comm comm);
/* Allocate *pSize bytes with malloc(). A request for zero bytes yields NULL
 * without calling the allocator. The caller owns the returned memory.
 */
static inline
void* allocate_memory_for_ddt( unsigned int* pSize )
{
    const unsigned int nbytes = *pSize;

    return (0 == nbytes) ? NULL : malloc( nbytes );
}
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -190,8 +190,12 @@ do { \
/*printf( "memcpy dest = %p src = %p length = %d\n", (void*)(DST), (void*)(SRC), (int)(BLENGTH) );*/ \
memcpy( (DST), (SRC), (BLENGTH) ); }
/* Optional pointer check hook. When DO_INTENSIVE_DEBUGGING is defined the
 * macro forwards its five arguments to ompi_ddt_safeguard_pointer();
 * otherwise it expands to nothing, so a call site followed by ';' compiles
 * to an empty statement and the arguments are not evaluated.
 */
#if defined(DO_INTENSIVE_DEBUGGING)
#define OMPI_DDT_SAFEGUARD_POINTER( ACTPTR, LENGTH, INITPTR, PDATA, COUNT ) \
ompi_ddt_safeguard_pointer( (ACTPTR), (LENGTH), (INITPTR), (PDATA), (COUNT) )
#else
#define OMPI_DDT_SAFEGUARD_POINTER( ACTPTR, LENGTH, INITPTR, PDATA, COUNT )
#endif /* DO_INTENSIVE_DEBUGGING */
static inline void ompi_ddt_safeguard_pointer( void* actual_ptr, int length,
void* initial_ptr,
@ -268,35 +272,36 @@ int ompi_convertor_create_stack_with_pos_contig( ompi_convertor_t* pConvertor,
ompi_datatype_t* pData = pConvertor->pDesc;
dt_elem_desc_t* pElems;
uint32_t count;
int32_t index;
long extent;
pStack = pConvertor->pStack;
pStack->count = pConvertor->count;
pStack->index = -1;
pStack[0].count = pConvertor->count;
pStack[0].index = -1;
if( pData->opt_desc.desc != NULL ) {
pElems = pData->opt_desc.desc;
pStack->end_loop = pData->opt_desc.used;
pStack[0].end_loop = pData->opt_desc.used;
} else {
pElems = pData->desc.desc;
pStack->end_loop = pData->desc.used;
pStack[0].end_loop = pData->desc.used;
}
/* Special case for contiguous datatypes */
count = starting_point / pData->size;
extent = pData->ub - pData->lb;
pStack->disp = count * extent;
pStack->count -= count;
count = starting_point - count * pData->size; /* number of bytes after the loop */
pStack[1].index = 0;
pStack[1].count = pElems[count].count - count;
pStack[1].end_loop = pStack->end_loop;
pStack[1].disp = pStack->disp /* the total displacement depending on the already done elements */
+ pData->size - count; /* everything from the beginig of this loop */
pStack[0].disp = count * extent;
pStack[0].count -= count;
/* now compute the number of pending bytes */
count = starting_point - count * pData->size;
pStack[1].index = 0; /* useless */
pStack[1].count = pData->size - count;
pStack[1].end_loop = 0; /* useless */
/* we save the current displacement starting from the beginning
* of this data.
*/
pStack[1].disp = count;
pConvertor->bConverted = starting_point;
pConvertor->stack_pos = 1;

Просмотреть файл

@ -574,18 +574,66 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv,
return (pConv->bConverted == (pData->size * pConv->count));
}
/* the Contig versions does not use the stack. They can easily retrieve
/* the contig versions does not use the stack. They can easily retrieve
* the status with just the informations from pConvertor->bConverted.
*/
static
int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
/* Pack routine used when no conversion is needed and the data is contiguous
 * (as the name says). It walks at most *out_size iovec entries: an entry
 * whose iov_base is NULL receives a pointer straight into the source buffer,
 * any other entry is filled with MEMCPY.
 *
 * On exit *max_data holds the number of bytes converted by this call
 * (bConverted minus its value on entry) and *out_size holds the final value
 * of the loop index. *freeAfter is always set to 0.
 *
 * Returns 1 when bConverted equals pData->size * pConv->count, 0 otherwise.
 */
static int
ompi_convertor_pack_no_conv_contig( ompi_convertor_t* pConv,
struct iovec* iov,
uint32_t* out_size,
uint32_t* max_data,
int* freeAfter )
{
dt_desc_t* pData = pConv->pDesc;
dt_stack_t* pStack = pConv->pStack;
char *pSrc;
/* total number of bytes described by the convertor */
size_t length = pData->size * pConv->count;
uint32_t iov_count, initial_amount = pConv->bConverted;
/* NOTE(review): this value is overwritten a few lines below before it is
 * ever read -- confirm whether this assignment is still needed.
 */
pSrc = pConv->pBaseBuf + pStack->disp; /* actual starting point for the conversion */
*freeAfter = 0;
/* There are some optimizations that can be done if the upper level
* does not provide a buffer.
*/
/* start from the base buffer plus both saved displacements */
pSrc = pConv->pBaseBuf + pStack[0].disp + pStack[1].disp;
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
if( iov[iov_count].iov_base == NULL ) {
/* no buffer supplied: expose the source memory directly, clamping the
 * length so that bConverted cannot go past the total
 */
iov[iov_count].iov_base = pSrc;
if( (pConv->bConverted + iov[iov_count].iov_len) > length )
iov[iov_count].iov_len = length - pConv->bConverted;
} else {
/* contiguous data just memcpy the smallest data in the user buffer */
/* NOTE(review): the clamp uses the total length, not the bytes still
 * remaining (length - bConverted); verify this is intended when
 * bConverted is already non-zero.
 */
iov[iov_count].iov_len = IMIN( iov[iov_count].iov_len, length );
OMPI_DDT_SAFEGUARD_POINTER( pSrc, iov[iov_count].iov_len,
pConv->pBaseBuf, pData, pConv->count );
MEMCPY( iov[iov_count].iov_base, pSrc, iov[iov_count].iov_len);
}
/* account for the bytes just handled and move the source pointer */
pConv->bConverted += iov[iov_count].iov_len;
pStack[0].disp += iov[iov_count].iov_len;
/* NOTE(review): pStack[1].disp is no longer added here, unlike the initial
 * computation of pSrc -- confirm against the stack layout.
 */
pSrc = pConv->pBaseBuf + pStack[0].disp;
/* NOTE(review): leaving through this break skips the iov_count++ of the
 * loop, so *out_size below ends up as the index of the last entry used
 * rather than the number of entries used -- confirm what callers expect.
 */
if( pConv->bConverted == length ) break;
}
/* the number of complete datatypes still to be copied */
pStack[0].count = pConv->count - (pConv->bConverted / pData->size);
/* the amount of data (in bytes) that still have to be done on the last data */
pStack[1].count = pConv->bConverted - pData->size * pStack[0].count;
pStack[1].disp = pData->size - pStack[1].count;
/* update the return value */
*max_data = pConv->bConverted - initial_amount;
*out_size = iov_count;
return (pConv->bConverted == length);
}
static int
ompi_convertor_pack_no_conv_contig_with_gaps( ompi_convertor_t* pConv,
struct iovec* iov,
uint32_t* out_size,
uint32_t* max_data,
int* freeAfter )
{
dt_desc_t* pData = pConv->pDesc;
dt_stack_t* pStack = &(pConv->pStack[pConv->stack_pos]);
dt_stack_t* pStack = pConv->pStack;
char *pSrc, *pDest;
size_t length = pData->size * pConv->count;
long extent;
@ -595,27 +643,31 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
i = pConv->bConverted / pData->size; /* how many we already pack */
extent = pData->ub - pData->lb;
pSrc = pConv->pBaseBuf + pStack->disp + pStack->count; /* actual starting point for the conversion */
pSrc = pConv->pBaseBuf + pStack->disp; /* actual starting point for the conversion */
*freeAfter = 0;
/* There are some optimizations that can be done if the upper level
* does not provide a buffer.
*/
pSrc = pConv->pBaseBuf + pStack[0].disp + pStack[1].disp;
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
if( iov[iov_count].iov_base == NULL ) {
/* special case for small data. We avoid allocating memory if we
* can fill the iovec directly with the address of the remaining
* data.
*/
if( (pConv->count - i) < ((*out_size) - iov_count) ) {
if( (uint32_t)pStack->count < ((*out_size) - iov_count) ) {
for( index = iov_count; i < pConv->count; i++, index++ ) {
iov[index].iov_base = pSrc;
iov[index].iov_len = pData->size;
pSrc += extent;
pConv->bConverted += pData->size;
iov[index].iov_base = pSrc + pStack[0].disp + pStack[1].disp;
iov[index].iov_len = pStack[1].count;
pStack[0].disp += extent;
total_bytes_converted += pStack[1].count;
pStack[1].disp = 0; /* reset it for the next round */
pStack[1].count = pData->size;
}
*out_size = iov_count + index;
*max_data = total_bytes_converted + index * pData->size;
pConv->bConverted += total_bytes_converted;
*max_data = total_bytes_converted;
return 1; /* we're done */
}
/* now special case for big contiguous data with gaps around */
@ -648,7 +700,7 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
if( (long)pData->size == extent ) { /* that really contiguous */
if( iov[iov_count].iov_base == NULL ) {
iov[iov_count].iov_base = pSrc; /* + pConv->bConverted; */
iov[iov_count].iov_base = pSrc;
if( (pConv->bConverted + iov[iov_count].iov_len) > length )
iov[iov_count].iov_len = length - pConv->bConverted;
} else {
@ -696,6 +748,11 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
iov[iov_count].iov_len -= max_allowed;
total_bytes_converted += iov[iov_count].iov_len;
}
/* Now update the pSrc pointer. At the end of each part we have to update
* the pStack[0].disp field. BEWARE: here we drop pStack[1].disp as
* it's supposed to be useless from now on.
*/
pSrc = pConv->pBaseBuf + pStack[0].disp;
}
*max_data = total_bytes_converted;
pConv->bConverted += total_bytes_converted;
@ -703,75 +760,6 @@ int ompi_convertor_pack_no_conversion_contig( ompi_convertor_t* pConv,
return (pConv->bConverted == length);
}
/* Generic pack entry point: loops over the supplied iovec entries and lets
 * the convertor's progress routine fill them until the conversion completes
 * or the entries are exhausted.
 *
 * The pack routines should do 2 things:
 * - first, if the provided iovec contains NULL pointers then they should provide
 * buffer space. If the data is contiguous then it should provide pointers directly
 * to the user space depending on the iov_len argument. If -1 then the whole buffer
 * can be supplied at once; if not, several steps need to be executed and it should
 * provide the correct pointer every time. But if the user provides a buffer, then
 * some parts of the data should be packed inside this buffer, but we still should
 * be able to have pointers to the user buffer on the subsequent calls.
 *
 * The iovec provided by the upper level can have several meanings:
 * - the iov_base field contains a non-NULL address: the user has provided some memory.
 * Then the iov_len field should be non-zero too, and we have to respect the high
 * level requirements.
 * - if iov_base of the first iovec is NULL then the iov_len provided in the first iovec
 * is the maximum amount of data that we will pack. If this field is set to zero,
 * then we compute this maximum using the convertor and the amount of data already
 * packed.
 *
 * On return *out_size is set to the index reached in the iovec array (0 when
 * there was nothing to pack) and *freeAfter has been cleared on entry.
 *
 * Return 0 if everything went OK and if there is still room before the complete
 * conversion of the data (need additional calls with other input buffers),
 * 1 if everything went fine and the data was completely converted,
 * -1 if something went wrong.
 */
int ompi_convertor_pack( ompi_convertor_t* pConv,
struct iovec* iov,
uint32_t* out_size,
uint32_t* max_data,
int* freeAfter )
{
dt_desc_t* pData = pConv->pDesc;
uint32_t done = 0, index = 0;
*freeAfter = 0; /* nothing to free yet */
/* TODO should use the remote size */
if( pConv->bConverted == (pData->size * pConv->count) ) { /* conversion completed or nothing to do */
iov[0].iov_len = 0;
*out_size = 0;
return 1; /* nothing to do */
}
while( index < (*out_size)) {
/* a zero length means "everything that is left"; such an entry must not
 * carry a user buffer
 */
if( iov[index].iov_len == 0 ) {
assert( iov[index].iov_base == NULL );
iov[index].iov_len = pConv->count * pData->size - pConv->bConverted;
}
#if defined(ONE_STEP)
/* ONE_STEP build: handle exactly one iovec entry per iteration, allocating
 * a buffer through memAlloc_fn when none was supplied and recording that
 * in the freeAfter bit mask.
 */
{
int howMany = 1;
if( iov[index].iov_base == NULL ) {
iov[index].iov_base = pConv->memAlloc_fn( &(iov[index].iov_len) );
(*freeAfter) |= (1 << index);
}
done = convertor_progress( pConv, &(iov[index]), &howMany, max_data, freeAfter );
index++;
}
#else
/* We don't allocate any memory. The packing function should allocate it
* if needed. If it's possible to find iovecs in the derived datatype
* description then we don't have to allocate any memory.
*/
/* NOTE(review): *out_size is passed as the entry count even though the
 * array handed over starts at iov[index] -- confirm the progress routine
 * cannot run past the end of the caller's array after the first pass.
 */
done = ompi_convertor_progress( pConv, &(iov[index]), out_size, max_data, freeAfter );
index += (*out_size);
#endif /* ONE_STEP */
if( done == 1 ) break;
}
*out_size = index;
/*printf( "pack return %d iovec with a length of %d\n", index, *max_data );*/
return done;
}
extern int ompi_ddt_local_sizes[DT_MAX_PREDEFINED];
int ompi_convertor_init_for_send( ompi_convertor_t* pConv,
uint32_t flags,
@ -802,9 +790,13 @@ int ompi_convertor_init_for_send( ompi_convertor_t* pConv,
pConv->converted = 0;
pConv->bConverted = 0;
pConv->memAlloc_fn = allocfn;
pConv->fAdvance = ompi_convertor_pack_homogeneous_with_memcpy;
if( dt->flags & DT_FLAG_CONTIGUOUS ) {
pConv->flags |= DT_FLAG_CONTIGUOUS | CONVERTOR_HOMOGENEOUS;
pConv->fAdvance = ompi_convertor_pack_no_conversion_contig;
if( (pConv->pDesc->ub - pConv->pDesc->lb) == (long)pConv->pDesc->size )
pConv->fAdvance = ompi_convertor_pack_no_conv_contig;
else
pConv->fAdvance = ompi_convertor_pack_no_conv_contig_with_gaps;
return ompi_convertor_create_stack_with_pos_contig( pConv, starting_pos, ompi_ddt_local_sizes );
}
pConv->fAdvance = ompi_convertor_pack_general;

Просмотреть файл

@ -342,39 +342,6 @@ static int ompi_convertor_unpack_homogeneous_contig( ompi_convertor_t* pConv,
return (pConv->bConverted == (pData->size * pConv->count));
}
/* Unpack entry point.
 *
 * Clears *freeAfter, returns 1 immediately (with iov[0].iov_len and
 * *max_data zeroed) when every byte has already been converted, serves
 * contiguous data without a caller-supplied buffer directly from the user
 * buffer, and otherwise defers to the convertor's progress routine.
 */
int ompi_convertor_unpack( ompi_convertor_t* pConvertor,
                           struct iovec* iov,
                           uint32_t* out_size,
                           uint32_t* max_data,
                           int32_t* freeAfter )
{
    dt_desc_t *pData = pConvertor->pDesc;
    uint32_t remaining;

    *freeAfter = 0;

    /* conversion already complete */
    if( (pData->size * pConvertor->count) == pConvertor->bConverted ) {
        iov[0].iov_len = 0;
        *max_data = 0;
        return 1;  /* nothing to do */
    }

    /* anything but "contiguous and no buffer supplied" goes the general way */
    if( !(pConvertor->flags & DT_FLAG_CONTIGUOUS) || (NULL != iov[0].iov_base) ) {
        return ompi_convertor_progress( pConvertor, iov, out_size, max_data, freeAfter );
    }

    remaining = pConvertor->count * pData->size - pConvertor->bConverted;
    iov[0].iov_base = pConvertor->pBaseBuf + pData->true_lb + pConvertor->bConverted;

    /* a zero iov_len asks for the whole remaining buffer; any other value
     * caps the size of the chunk handed back
     */
    if( (0 != iov[0].iov_len) && (iov[0].iov_len < remaining) ) {
        remaining = iov[0].iov_len;
    }

    iov[0].iov_len = remaining;
    pConvertor->bConverted += remaining;
    return (pConvertor->bConverted == (pData->size * pConvertor->count));
}
/* Return value:
* 0 : nothing has been done
* positive value: number of item converted.