From 88a363fe34b57150a2205ac393660bd0596c000d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 13 Jun 2006 07:23:43 +0000 Subject: [PATCH] Several changes: - add more comments on the pack and unpack functions. - remove all pack/unpack versions that are not used anymore. - other various cleanups. - update the safeguard macro (which computes the boundaries of the datatype in order to protect us from accessing memory locations outside of the data). - for the contiguous (with or without gaps) pack and unpack correctly compute the starting point. This commit was SVN r10327. --- ompi/datatype/datatype_internal.h | 5 +- ompi/datatype/datatype_pack.c | 573 +--------------------------- ompi/datatype/datatype_prototypes.h | 24 -- ompi/datatype/datatype_unpack.c | 30 +- ompi/datatype/dt_add.c | 1 + 5 files changed, 42 insertions(+), 591 deletions(-) diff --git a/ompi/datatype/datatype_internal.h b/ompi/datatype/datatype_internal.h index 1662beb624..c2dc422d38 100644 --- a/ompi/datatype/datatype_internal.h +++ b/ompi/datatype/datatype_internal.h @@ -253,8 +253,9 @@ OMPI_DECLSPEC int ompi_ddt_safeguard_pointer_debug_breakpoint( const void* actua { \ char *__lower_bound = (char*)(INITPTR), *__upper_bound; \ assert( ((LENGTH) != 0) && ((COUNT) != 0) ); \ - __lower_bound += (PDATA)->true_lb - (PDATA)->lb; \ - __upper_bound = (INITPTR) + ((PDATA)->ub - (PDATA)->lb) * ((COUNT) - 1) + (PDATA)->true_ub - (PDATA)->lb; \ + __lower_bound += (PDATA)->true_lb; \ + __upper_bound = (INITPTR) + (PDATA)->true_ub + \ + ((PDATA)->ub - (PDATA)->lb) * ((COUNT) - 1); \ if( ((ACTPTR) < __lower_bound) || ((ACTPTR) >= __upper_bound) ) { \ ompi_ddt_safeguard_pointer_debug_breakpoint( (ACTPTR), (LENGTH), (INITPTR), (PDATA), (COUNT) ); \ opal_output( 0, "%s:%d\n\tPointer %p size %d is outside [%p,%p] for\n\tbase ptr %p count %d and data \n", \ diff --git a/ompi/datatype/datatype_pack.c b/ompi/datatype/datatype_pack.c index 3bb1edc062..9a63c1654c 100644 --- a/ompi/datatype/datatype_pack.c +++ 
b/ompi/datatype/datatype_pack.c @@ -33,570 +33,18 @@ extern int ompi_pack_debug; #include "ompi/datatype/datatype_prototypes.h" #if defined(CHECKSUM) -#define ompi_pack_general_function ompi_pack_general_checksum -#define ompi_pack_homogeneous_with_memcpy_function ompi_pack_homogeneous_with_memcpy_checksum -#define ompi_pack_no_conversion_function ompi_pack_no_conversion_checksum #define ompi_pack_homogeneous_contig_function ompi_pack_homogeneous_contig_checksum #define ompi_pack_homogeneous_contig_with_gaps_function ompi_pack_homogeneous_contig_with_gaps_checksum -#define ompi_generic_simple_pack_function ompi_generic_simple_pack_checksum +#define ompi_generic_simple_pack_function ompi_generic_simple_pack_checksum #else -#define ompi_pack_general_function ompi_pack_general -#define ompi_pack_homogeneous_with_memcpy_function ompi_pack_homogeneous_with_memcpy -#define ompi_pack_no_conversion_function ompi_pack_no_conversion #define ompi_pack_homogeneous_contig_function ompi_pack_homogeneous_contig #define ompi_pack_homogeneous_contig_with_gaps_function ompi_pack_homogeneous_contig_with_gaps -#define ompi_generic_simple_pack_function ompi_generic_simple_pack +#define ompi_generic_simple_pack_function ompi_generic_simple_pack #endif /* defined(CHECKSUM) */ -int32_t -ompi_pack_general_function( ompi_convertor_t* pConvertor, - struct iovec* iov, uint32_t* out_size, - size_t* max_data, - int32_t* freeAfter ) -{ - dt_stack_t* pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - int count_desc; /* the number of items already done in the actual pos_desc */ - int type = DT_CHAR; /* type at current position */ - uint32_t advance; /* number of bytes that we should advance the buffer */ - long disp_desc = 0; /* compute displacement for truncated data */ - int bConverted = 0; /* number of bytes converted this time */ - const ompi_datatype_t *pData = pConvertor->pDesc; - const 
ompi_convertor_master_t* master = pConvertor->master; - dt_elem_desc_t* pElem; - char* pOutput = pConvertor->pBaseBuf; - char* pInput; - int iCount, rc; - uint32_t iov_count, total_bytes_converted = 0; - - DUMP( "convertor_decode( %p, {%p, %d}, %d )\n", (void*)pConvertor, - iov[0].iov_base, iov[0].iov_len, *out_size ); - - pElem = pData->desc.desc; - - pStack = pConvertor->pStack + pConvertor->stack_pos; - pos_desc = pStack->index; - disp_desc = pStack->disp; - count_desc = pStack->count; - pStack--; - pConvertor->stack_pos--; - - DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "starting" ); - DUMP( "remember position on stack %d last_elem at %d\n", pConvertor->stack_pos, pos_desc ); - DUMP( "top stack info {index = %d, count = %d}\n", - pStack->index, pStack->count ); - - for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - bConverted = 0; - if( iov[iov_count].iov_base == NULL ) { - size_t length = iov[iov_count].iov_len; - if( length <= 0 ) - length = pConvertor->local_size - pConvertor->bConverted - bConverted; - if( (*max_data) < length ) - length = *max_data; - iov[iov_count].iov_base = pConvertor->memAlloc_fn( &length, pConvertor->memAlloc_userdata ); - iov[iov_count].iov_len = length; - *freeAfter = (*freeAfter) | ( 1 << iov_count); - } - pInput = iov[iov_count].iov_base; - iCount = iov[iov_count].iov_len; - while( 1 ) { - if( DT_END_LOOP == pElem[pos_desc].elem.common.type ) { /* end of the current loop */ - if( --(pStack->count) == 0 ) { /* end of loop */ - if( pConvertor->stack_pos == 0 ) - goto complete_loop; /* completed */ - pConvertor->stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if( pStack->index == -1 ) { - pStack->disp += (pData->ub - pData->lb); - } else { - assert( DT_LOOP == pElem[pStack->index].elem.common.type ); - pStack->disp += pElem[pStack->index].loop.extent; - } - } - count_desc = pElem[pos_desc].elem.count; - disp_desc = pElem[pos_desc].elem.disp; - } - if( DT_LOOP == 
pElem[pos_desc].elem.common.type ) { - do { - PUSH_STACK( pStack, pConvertor->stack_pos, - pos_desc, DT_LOOP, pElem[pos_desc].loop.loops, - pStack->disp, pos_desc + pElem[pos_desc].loop.items + 1); - pos_desc++; - } while( DT_LOOP == pElem[pos_desc].elem.common.type ); /* let's start another loop */ - DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loops" ); - /* update the current state */ - count_desc = pElem[pos_desc].elem.count; - disp_desc = pElem[pos_desc].elem.disp; - continue; - } - while( pElem[pos_desc].elem.common.flags & DT_FLAG_DATA ) { - /* now here we have a basic datatype */ - type = pElem[pos_desc].elem.common.type; - rc = master->pFunctions[type]( pConvertor, count_desc, - pOutput + pStack->disp + disp_desc, - iCount, pElem[pos_desc].elem.extent, - pInput, iCount, BASIC_DDT_FROM_ELEM(pElem[pos_desc])->size, &advance ); - iCount -= advance; /* decrease the available space in the buffer */ - pInput += advance; /* increase the pointer to the buffer */ - bConverted += advance; - if( rc != count_desc ) { - /* not all data has been converted. Keep the state */ - count_desc -= rc; - disp_desc += rc * pElem[pos_desc].elem.extent; - if( iCount != 0 ) - printf( "pack there is still room in the input buffer %d bytes\n", iCount ); - goto complete_loop; - } - pos_desc++; /* advance to the next data */ - count_desc = pElem[pos_desc].elem.count; - disp_desc = pElem[pos_desc].elem.disp; - if( iCount == 0 ) goto complete_loop; /* break if there is no more data in the buffer */ - } - } - complete_loop: - pConvertor->bConverted += bConverted; /* update the already converted bytes */ - iov[iov_count].iov_len = bConverted; /* update the length in the iovec */ - total_bytes_converted += bConverted; - } - *max_data = total_bytes_converted; - /* out of the loop: we have complete the data conversion or no more space - * in the buffer. 
- */ - if( pConvertor->local_size == pConvertor->bConverted ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - return 1; - } - - /* I complete an element, next step I should go to the next one */ - PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, type, count_desc, - disp_desc, pos_desc ); - - return 0; -} - -/* We suppose here that we work with an already optimized version of the data - */ -int32_t -ompi_pack_homogeneous_with_memcpy_function( ompi_convertor_t* pConv, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data, - int* freeAfter ) -{ - dt_stack_t* pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - int i; /* index for basic elements with extent */ - int bConverted = 0; /* number of bytes converted/moved this time */ - long lastDisp = 0, last_count = 0; - uint32_t space = iov[0].iov_len, last_blength = 0; - char* pDestBuf; - const ompi_datatype_t* pData = pConv->pDesc; - dt_elem_desc_t* pElems; - - pDestBuf = iov[0].iov_base; - - pElems = pConv->use_desc->desc; - - pStack = pConv->pStack + pConv->stack_pos; - pos_desc = pStack->index; - lastDisp = pStack->disp; - last_count = pStack->count; - pStack--; - pConv->stack_pos--; - - while( 1 ) { - if( DT_END_LOOP == pElems[pos_desc].elem.common.type ) { /* end of the current loop */ - if( --(pStack->count) == 0 ) { /* end of loop */ - if( pConv->stack_pos == 0 ) { /* finish everything */ - last_count = 0; - pos_desc = -1; - goto end_loop; - } - pStack--; - pConv->stack_pos--; - pos_desc++; /* go to the next element */ - } else { - if( pStack->index == -1 ) { - pStack->disp += (pData->ub - pData->lb); - pos_desc = 0; - } else { - assert( DT_LOOP == pElems[pStack->index].elem.common.type ); - pStack->disp += pElems[pStack->index].loop.extent; - pos_desc = pStack->index + 1; - } - } - last_count = pElems[pos_desc].elem.count; - last_blength = last_count; - lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - 
continue; - } - while( DT_LOOP == pElems[pos_desc].elem.common.type ) { - int stop_in_loop = 0; - if( pElems[pos_desc].elem.common.flags & DT_FLAG_CONTIGUOUS ) { - ddt_endloop_desc_t* end_loop = &(pElems[pos_desc + pElems[pos_desc].loop.items].end_loop); - if( (end_loop->size * last_count) > space ) { - stop_in_loop = last_count; - last_count = space / end_loop->size; - } - for( i = 0; i < last_count; i++ ) { - OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, end_loop->size, - pConv->pBaseBuf, pData, pConv->count ); - MEMCPY_CSUM( pDestBuf, pConv->pBaseBuf + lastDisp, end_loop->size, pConv ); - pDestBuf += end_loop->size; /* size of the contiguous data */ - lastDisp += pElems[pos_desc].loop.extent; - } - space -= (end_loop->size * last_count); - bConverted += (end_loop->size * last_count); - if( stop_in_loop == 0 ) { - pos_desc += pElems[pos_desc].loop.items + 1; - last_count = pElems[pos_desc].elem.count; - continue; - } - /* mark some of the iterations as completed */ - last_count = stop_in_loop - last_count; - last_blength = 0; - /* Save the stack with the correct last_count value. */ - } - PUSH_STACK( pStack, pConv->stack_pos, pos_desc, DT_LOOP, last_count, - pStack->disp, pos_desc + pElems[pos_desc].loop.items ); - pos_desc++; - last_count = pElems[pos_desc].elem.count; - } - /* now here we have a basic datatype */ - while( pElems[pos_desc].elem.common.flags & DT_FLAG_DATA ) { - /* do we have enough space in the buffer ? 
*/ - last_blength = last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; - if( space < last_blength ) { - last_blength = last_count; - last_count = space / BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; - space -= (last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size); - last_blength -= last_count; - goto end_loop; /* or break whatever but go out of this while */ - } - OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, last_count, - pConv->pBaseBuf, pData, pConv->count ); - MEMCPY_CSUM( pDestBuf, pConv->pBaseBuf + lastDisp, last_count, pConv ); - bConverted += last_blength; - space -= last_blength; - pDestBuf += last_blength; - pos_desc++; /* advance to the next data */ - lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - last_count = pElems[pos_desc].elem.count; - } - } - end_loop: - if( last_count != 0 ) { /* save the internal state */ - OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, last_count, - pConv->pBaseBuf, pData, pConv->count ); - MEMCPY_CSUM( pDestBuf, pConv->pBaseBuf + lastDisp, last_count, pConv ); - bConverted += last_count; - lastDisp += last_count; - } - pConv->bConverted += bConverted; /* update the byte converted field in the convertor */ - iov[0].iov_len = bConverted; /* update the length in the iovec */ - *max_data = bConverted; - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - /* update the current stack position */ - PUSH_STACK( pStack, pConv->stack_pos, pos_desc, last_blength, pElems[pos_desc].elem.common.type, - lastDisp, pos_desc ); - - return 0; -} - #define IOVEC_MEM_LIMIT 8192 -/* The basic idea is to pack or return iovec depending on the datatype shape. If the data - * is scattered in memory using small chuncks then we have to allocate some space (unless the upper - * level provide some) and pack the data inside. If the chunks of data are large enough - * then is useless to allocate additional memory and do the memcpy operation. 
We can simply - * return the pointer to the contiguous piece of memory to the upper level. - */ -int32_t -ompi_pack_no_conversion_function( ompi_convertor_t* pConv, - struct iovec* iov, - uint32_t *out_size, - size_t* max_data, - int* freeAfter ) -{ - dt_stack_t* pStack; /* pointer to the position on the stack */ - int pos_desc; /* actual position in the description of the derived datatype */ - int i; /* index for basic elements with extent */ - uint32_t iov_pos = 0; /* index in the iovec where we put data inside */ - int bConverted = 0; /* number of bytes converted/moved this time */ - uint32_t space_on_iovec; /* amount of free space on the current iovec */ - long lastDisp = 0; - uint32_t space = *max_data, last_blength = 0, saveLength; - char *destination, *source; - const ompi_datatype_t* pData = pConv->pDesc; - ddt_elem_desc_t pack_elem; - dt_elem_desc_t* pElems; - - pElems = pConv->use_desc->desc; - - pStack = pConv->pStack + pConv->stack_pos; - destination = iov[0].iov_base; - source = (char*)pConv->pBaseBuf + pStack->disp; - - /* retrieve the context of the last call */ - pos_desc = pStack->index; - pack_elem.count = pStack->count; - pack_elem.common.type = pElems[pos_desc].elem.common.type; - last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; - lastDisp = pStack->disp; - DO_DEBUG( opal_output( 0, "pack_no_conversion stack_pos %d index %d count %d last_blength %ld lastDisp %ld bConverted %d\n", - pConv->stack_pos, pStack->index, pStack->count, last_blength, lastDisp, - pConv->bConverted ); ); - saveLength = 0; - pStack--; - pConv->stack_pos--; - - *freeAfter = (*freeAfter) & ~((1 << (*out_size)) - 1); - space_on_iovec = iov[0].iov_len; - - while( pos_desc >= 0 ) { - if( DT_END_LOOP == pElems[pos_desc].elem.common.type ) { /* end of the current loop */ - if( --(pStack->count) == 0 ) { /* end of loop */ - if( pConv->stack_pos == 0 ) { /* finish everything */ - if( saveLength != 0 ) { - /* there is still a chunk of memory to be 
handled, but here we dont allocate more - * memory. We just copy what we can in the right place and update the values to be - * saved on the next round. - */ - if( iov_pos < (*out_size) ) { /* still some place in the iovec */ - if( iov[iov_pos].iov_base == NULL ) { - /* prepare a new iovec */ - iov[iov_pos].iov_base = source; - iov[iov_pos].iov_len = saveLength; - bConverted += saveLength; - saveLength = 0; - iov_pos++; - space_on_iovec = 0; - COMPUTE_CSUM( iov[iov_pos].iov_base, iov[iov_pos].iov_len, pConv ); - /* let's go out of here */ - } else { - uint32_t copy_length = saveLength; - if( space_on_iovec < saveLength ) { - copy_length = space_on_iovec; - } - OMPI_DDT_SAFEGUARD_POINTER( source, copy_length, - pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "1. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source, - copy_length, bConverted, space_on_iovec, pConv->bConverted ); ); - MEMCPY_CSUM( destination, source, copy_length, pConv ); - source += copy_length; - destination += copy_length; - bConverted += copy_length; - space_on_iovec -= copy_length; - saveLength -= copy_length; - } - } - } - iov[iov_pos].iov_len -= space_on_iovec; - pack_elem.count = 0; - pos_desc = -1; - last_blength = 0; - goto end_loop; - } - pConv->stack_pos--; - pStack--; - } else { - pos_desc = pStack->index; /* DT_LOOP index */ - if( pos_desc == -1 ) { - pStack->disp += (pData->ub - pData->lb); - } else { - assert( DT_LOOP == pElems[pos_desc].elem.common.type ); - pStack->disp += pElems[pos_desc].loop.extent; - } - } - pos_desc++; /* go to the next element */ - lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - pack_elem.count = pElems[pos_desc].elem.count; - last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; - continue; /* next loop */ - } - while( DT_LOOP == pElems[pos_desc].elem.common.type ) { - int stop_in_loop = 0; - - /* If the loop container is contiguous then we can do some - * 
optimizations. - */ - if( pElems[pos_desc].loop.common.flags & DT_FLAG_CONTIGUOUS ) { - /* point to the end of loop element */ - ddt_endloop_desc_t* end_loop = &(pElems[pos_desc + pElems[pos_desc].loop.items].end_loop); - if( iov[iov_pos].iov_base == NULL ) { - size_t length = iov[iov_pos].iov_len; - iov[iov_pos].iov_base = pConv->memAlloc_fn( &length, pConv->memAlloc_userdata ); - iov[iov_pos].iov_len = length; - space_on_iovec = iov[iov_pos].iov_len; - destination = iov[iov_pos].iov_base; - (*freeAfter) |= (1 << iov_pos); - } - /* compute the maximum amount of data to be packed */ - if( (end_loop->size * pack_elem.count) > space_on_iovec ) { - stop_in_loop = pack_elem.count; - pack_elem.count = space_on_iovec / end_loop->size; - } - /* Now let's do it */ - for( i = 0; i < (int)pack_elem.count; i++ ) { - OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, end_loop->size, - pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG (opal_output( 0, "2. memcpy( %p, %p, %ld )\n", destination, pConv->pBaseBuf + lastDisp, - end_loop->size ); ); - MEMCPY_CSUM( destination, pConv->pBaseBuf + lastDisp, end_loop->size, pConv ); - lastDisp += pElems[pos_desc].loop.extent; - destination += end_loop->size; - } - DO_DEBUG( opal_output( 0, "\t\tbConverted %d space %d pConv->bConverted %d\n", - bConverted, space_on_iovec, pConv->bConverted ); ); - i = end_loop->size * pack_elem.count; /* temporary value */ - space_on_iovec -= i; - space -= i; - bConverted += i; - if( stop_in_loop == 0 ) { /* did I stop before the end */ - /* the pElems point to the LOOP struct in the begining */ - pos_desc += pElems[pos_desc].loop.items + 1; - pack_elem.count = pElems[pos_desc].elem.count; - last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; - lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - continue; - } - /* mark some of the iterations as completed */ - pack_elem.count = stop_in_loop - pack_elem.count; - last_blength = 0; - /* Save the stack with the correct count 
value. */ - } - PUSH_STACK( pStack, pConv->stack_pos, pos_desc, DT_LOOP, pack_elem.count, - pStack->disp, pos_desc + pElems[pos_desc].loop.items ); - pos_desc++; - lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - pack_elem.count = pElems[pos_desc].elem.count; - last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; - } - /* now here we have a basic datatype */ - while( pElems[pos_desc].elem.common.flags & DT_FLAG_DATA ) { - /* first let's see if it's contiguous with the previous chunk of memory and - * we still have enough room in the buffer... - */ - if( ((source + saveLength) == (pConv->pBaseBuf + lastDisp)) - && ((saveLength + last_blength) <= space_on_iovec) - && (pElems[pos_desc].elem.extent == (long)BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size) ) { - /* ok still contiguous and we still have some space on the buffer */ - saveLength += last_blength; - /* nothing else to do, we act the next time */ - } else { - /* Now we have 2 piece of non contiguous memory. One start at source - * with a length of saveLength, the other start at - * pConv->pBaseBuf + lastDisp with a length of last_blength bytes. - * First we have to pack the old buffer and then we should decide - * what we do with the new one. - */ - do { - if( iov[iov_pos].iov_base == NULL ) { - size_t length; - - if( saveLength > IOVEC_MEM_LIMIT ) { - /* If the user didn't provide any memory, then we are free - * to handle this case as we want. - */ - iov[iov_pos].iov_base = source; - iov[iov_pos].iov_len = saveLength; - source = pConv->pBaseBuf + lastDisp; - /* update the pack counters values */ - bConverted += saveLength; - space -= saveLength; - saveLength = last_blength; - last_blength = 0; - if( ++iov_pos == (*out_size) ) goto end_loop; - destination = iov[iov_pos].iov_base; - space_on_iovec = iov[iov_pos].iov_len; - break; - } - length = iov[iov_pos].iov_len; - /* Let's allocate some. 
*/ - iov[iov_pos].iov_base = pConv->memAlloc_fn( &length, pConv->memAlloc_userdata ); - iov[iov_pos].iov_len = length; - (*freeAfter) |= (1 << iov_pos); - destination = iov[iov_pos].iov_base; - space_on_iovec = iov[iov_pos].iov_len; - } - /* In all the others cases we simply copy as much data as possible */ - if( space_on_iovec > saveLength ) { - OMPI_DDT_SAFEGUARD_POINTER( source, saveLength, - pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "3. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source, - saveLength, bConverted, space_on_iovec, pConv->bConverted ); ); - MEMCPY_CSUM( destination, source, saveLength, pConv ); - destination += saveLength; - /* update the pack counters values */ - bConverted += saveLength; - space -= saveLength; - space_on_iovec -= saveLength; - source = pConv->pBaseBuf + lastDisp; - saveLength = last_blength; - last_blength = 0; - break; - } - OMPI_DDT_SAFEGUARD_POINTER( source, space_on_iovec, - pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( opal_output( 0, "4. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source, - space_on_iovec, bConverted, space_on_iovec, pConv->bConverted ); ); - MEMCPY_CSUM( destination, source, space_on_iovec, pConv ); - /* let's prepare for the next round. As I keep trace of the amount that I still - * have to pack, the next time when I came here, I'll try to append something. - * If I already fill-up the amount of data required by the upper level, I will - * simply save all informations in the stack, if not I'll take care of allocating - * new memory and packing the data inside. - */ - source += space_on_iovec; - saveLength -= space_on_iovec; - /* update the pack counters values */ - bConverted += space_on_iovec; - space -= space_on_iovec; - lastDisp += space_on_iovec; - /* check for the next step */ - if( ++iov_pos == (*out_size) ) { /* are there more iovecs to fill ? 
*/ - if( saveLength == 0 ) { - lastDisp -= space_on_iovec; - saveLength = last_blength; - last_blength = 0; - } - goto end_loop; - } - destination = iov[iov_pos].iov_base; - space_on_iovec = iov[iov_pos].iov_len; - } while(1); /* continue forever */ - } - - if( saveLength > space ) /* this will be the last element copied this time */ - continue; - pos_desc++; /* advance to the next data */ - lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - pack_elem.count = pElems[pos_desc].elem.count; - last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; - } - } - end_loop: - assert( last_blength == 0 ); - pConv->bConverted += bConverted; /* update the byte converted field in the convertor */ - *max_data = bConverted; /* update the length in the iovec */ - if( ((*out_size) == iov_pos) || (iov[iov_pos].iov_base == NULL) ) *out_size = iov_pos; - else *out_size = iov_pos + 1; - if( pConv->bConverted == pConv->local_size ) { - pConv->flags |= CONVERTOR_COMPLETED; - return 1; - } - PUSH_STACK( pStack, pConv->stack_pos, pos_desc, pElems[pos_desc].elem.common.type, - saveLength, lastDisp, pos_desc ); - return 0; -} /* the contig versions does not use the stack. They can easily retrieve * the status with just the informations from pConvertor->bConverted. @@ -612,9 +60,11 @@ ompi_pack_homogeneous_contig_function( ompi_convertor_t* pConv, char *source_base = NULL; size_t length = pConv->local_size - pConv->bConverted; uint32_t iov_count, initial_amount = pConv->bConverted; - ddt_endloop_desc_t* _end_loop = &(pConv->use_desc->desc[pConv->use_desc->used].end_loop); + long initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; *freeAfter = 0; + source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp); + /* There are some optimizations that can be done if the upper level * does not provide a buffer. 
*/ @@ -622,8 +72,6 @@ ompi_pack_homogeneous_contig_function( ompi_convertor_t* pConv, if( 0 == length ) break; if( (size_t)iov[iov_count].iov_len > length ) iov[iov_count].iov_len = length; - source_base = (pConv->pBaseBuf + _end_loop->first_elem_disp - + pStack[0].disp + pStack[1].disp); if( iov[iov_count].iov_base == NULL ) { iov[iov_count].iov_base = source_base; COMPUTE_CSUM( iov[iov_count].iov_base, iov[iov_count].iov_len, pConv ); @@ -636,6 +84,7 @@ ompi_pack_homogeneous_contig_function( ompi_convertor_t* pConv, length -= iov[iov_count].iov_len; pConv->bConverted += iov[iov_count].iov_len; pStack[0].disp += iov[iov_count].iov_len; + source_base += iov[iov_count].iov_len; } /* update the return value */ @@ -661,6 +110,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv, long extent; uint32_t max_allowed, i, index; uint32_t iov_count, total_bytes_converted = 0; + long initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; extent = pData->ub - pData->lb; assert( (pData->flags & DT_FLAG_CONTIGUOUS) && ((long)pData->size != extent) ); @@ -676,7 +126,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv, /* There are some optimizations that can be done if the upper level * does not provide a buffer. 
*/ - user_memory = pConv->pBaseBuf + pData->true_lb + pStack[0].disp + pStack[1].disp; + user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp; for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { if( 0 == max_allowed ) break; /* we're done this time */ if( iov[iov_count].iov_base == NULL ) { @@ -687,12 +137,13 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv, if( (uint32_t)pStack->count < ((*out_size) - iov_count) ) { pStack[1].count = pData->size - (pConv->bConverted % pData->size); for( index = iov_count; i < pConv->count; i++, index++ ) { - iov[index].iov_base = user_memory + pStack[0].disp + pStack[1].disp; + iov[index].iov_base = user_memory; iov[index].iov_len = pStack[1].count; pStack[0].disp += extent; total_bytes_converted += pStack[1].count; pStack[1].disp = 0; /* reset it for the next round */ pStack[1].count = pData->size; + user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp; COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv ); } *out_size = iov_count + index; @@ -760,7 +211,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv, i++; /* just to compute the correct source pointer */ total_bytes_converted += done; } - user_memory = pConv->pBaseBuf + pData->true_lb + i * extent; + user_memory = pConv->pBaseBuf + initial_displ + i * extent; counter = max_allowed / pData->size; if( counter > pConv->count ) counter = pConv->count; for( i = 0; i < counter; i++ ) { @@ -777,7 +228,7 @@ ompi_pack_homogeneous_contig_with_gaps_function( ompi_convertor_t* pConv, * the pStack[0].disp field. BEWARE here we remove the pStack[1].disp as * it's supposed to be useless from now. 
*/ - user_memory = pConv->pBaseBuf + pData->true_lb + pStack[0].disp; + user_memory = pConv->pBaseBuf + initial_displ + pStack[0].disp; } *max_data = total_bytes_converted; pConv->bConverted += total_bytes_converted; diff --git a/ompi/datatype/datatype_prototypes.h b/ompi/datatype/datatype_prototypes.h index a773ef1b24..34c85ab583 100644 --- a/ompi/datatype/datatype_prototypes.h +++ b/ompi/datatype/datatype_prototypes.h @@ -15,30 +15,6 @@ #include "ompi_config.h" -OMPI_DECLSPEC int32_t -ompi_pack_general( ompi_convertor_t* pConvertor, - struct iovec* iov, uint32_t* out_size, - size_t* max_data, int32_t* freeAfter ); -OMPI_DECLSPEC int32_t -ompi_pack_general_checksum( ompi_convertor_t* pConvertor, - struct iovec* iov, uint32_t* out_size, - size_t* max_data, int32_t* freeAfter ); -OMPI_DECLSPEC int32_t -ompi_pack_homogeneous_with_memcpy( ompi_convertor_t* pConv, - struct iovec* iov, uint32_t* out_size, - size_t* max_data, int32_t* freeAfter ); -OMPI_DECLSPEC int32_t -ompi_pack_homogeneous_with_memcpy_checksum( ompi_convertor_t* pConv, - struct iovec* iov, uint32_t* out_size, - size_t* max_data, int32_t* freeAfter ); -int32_t -ompi_pack_no_conversion( ompi_convertor_t* pConv, - struct iovec* iov, uint32_t *out_size, - size_t* max_data, int32_t* freeAfter ); -int32_t -ompi_pack_no_conversion_checksum( ompi_convertor_t* pConv, - struct iovec* iov, uint32_t *out_size, - size_t* max_data, int32_t* freeAfter ); OMPI_DECLSPEC int32_t ompi_pack_homogeneous_contig( ompi_convertor_t* pConv, struct iovec* iov, uint32_t* out_size, diff --git a/ompi/datatype/datatype_unpack.c b/ompi/datatype/datatype_unpack.c index b6a7741aef..38c8af4399 100644 --- a/ompi/datatype/datatype_unpack.c +++ b/ompi/datatype/datatype_unpack.c @@ -70,7 +70,8 @@ ompi_unpack_general_function( ompi_convertor_t* pConvertor, int bConverted = 0; /* number of bytes converted this time */ const ompi_convertor_master_t* master = pConvertor->master; dt_elem_desc_t* pElems; - int oCount = (pConvertor->pDesc->ub - 
pConvertor->pDesc->lb) * pConvertor->count; + long extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb; + int oCount = extent * pConvertor->count; char* pInput; int iCount, rc; uint32_t iov_count, total_bytes_converted = 0; @@ -104,7 +105,7 @@ ompi_unpack_general_function( ompi_convertor_t* pConvertor, } if( pStack->index == -1 ) { - pStack->disp += (pConvertor->pDesc->ub - pConvertor->pDesc->lb); + pStack->disp += extent; } else { assert( DT_LOOP == pElems[pStack->index].elem.common.type ); pStack->disp += pElems[pStack->index].loop.extent; @@ -171,6 +172,15 @@ ompi_unpack_general_function( ompi_convertor_t* pConvertor, return 0; } +/** + * This function will be used to unpack all datatypes that have the contiguous flag set. + * Several types of datatypes match this criterion, not only the contiguous ones, but + * also the ones that have gaps in the beginning and/or at the end but where the data to + * be unpacked is contiguous. However, this function only works for homogeneous cases, + * and the datatypes that are contiguous and where the extent is equal to the size are + * taken into account directly in the ompi_convertor_unpack function (in convertor.c) for + * the homogeneous case. 
+ */ int32_t ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv, struct iovec* iov, @@ -184,7 +194,7 @@ ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv, long extent = pData->ub - pData->lb; uint32_t bConverted, length, remaining, i; dt_stack_t* stack = &(pConv->pStack[1]); - ddt_endloop_desc_t* _end_loop = &(pConv->use_desc->desc[pConv->use_desc->used].end_loop); + long initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp; for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { packed_buffer = (char*)iov[iov_count].iov_base; @@ -192,7 +202,7 @@ ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv, if( remaining > (uint32_t)iov[iov_count].iov_len ) remaining = iov[iov_count].iov_len; bConverted = remaining; /* how much will get unpacked this time */ - user_memory = pConv->pBaseBuf + _end_loop->first_elem_disp; + user_memory = pConv->pBaseBuf + initial_displ; /*opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %d\n", user_memory, packed_buffer, remaining );*/ @@ -254,6 +264,18 @@ ompi_unpack_homogeneous_contig_function( ompi_convertor_t* pConv, return 0; } +/** + * This function handles partial types. Depending on the send operation it might happen + * that we receive only a partial type (always a predefined type). In fact the outcome is + * that the unpack has to be done in 2 steps. As there is no way to know if the other + * part of the datatype is already received, we need to use a trick to handle this special + * case. The trick is to fill the missing part with some well known value, unpack the data + * as if it was completely received, and then move into the user memory only the bytes + * that don't match the well known value. This approach works as long as there is no need + * for more than structural changes. 
It will not work for cases where we will have to + * change the content of the data (as in all conversions that require changing the size + * of the exponent or mantissa). + */ static inline uint32_t ompi_unpack_partial_datatype( ompi_convertor_t* pConvertor, dt_elem_desc_t* pElem, char* partial_data, diff --git a/ompi/datatype/dt_add.c b/ompi/datatype/dt_add.c index 924ca93964..dc03462b6b 100644 --- a/ompi/datatype/dt_add.c +++ b/ompi/datatype/dt_add.c @@ -166,6 +166,7 @@ int32_t ompi_ddt_add( ompi_datatype_t* pdtBase, const ompi_datatype_t* pdtAdd, */ pdtBase->lb = lb; pdtBase->ub = ub; + if( 0 == pdtBase->nbElems ) old_true_ub = disp; else old_true_ub = pdtBase->true_ub; pdtBase->true_lb = LMIN( true_lb, pdtBase->true_lb );