diff --git a/src/datatype/Makefile.am b/src/datatype/Makefile.am index a7f56a8598..c4d8c06600 100644 --- a/src/datatype/Makefile.am +++ b/src/datatype/Makefile.am @@ -28,7 +28,7 @@ libdatatype_la_SOURCES = \ dt_add.c dt_create.c dt_create_array.c dt_create_dup.c dt_create_indexed.c \ dt_create_struct.c dt_create_vector.c dt_destroy.c dt_module.c \ dt_optimize.c dt_pack.c dt_sndrcv.c dt_unpack.c fake_stack.c dt_args.c \ - dt_arch.c dt_external32.c dt_match_size.c + dt_arch.c dt_external32.c dt_match_size.c new_pack.c # Conditionally install the header files diff --git a/src/datatype/datatype.h b/src/datatype/datatype.h index 459c64ec55..eb630e483e 100644 --- a/src/datatype/datatype.h +++ b/src/datatype/datatype.h @@ -207,7 +207,7 @@ struct ompi_convertor_t { uint32_t remoteArch; /**< the remote architecture */ uint32_t flags; /**< the properties of this convertor */ ompi_datatype_t* pDesc; /**< the datatype description associated with the convertor */ - dt_type_desc_t* use_desc; /**< the datatype version used by the convertor (normal or optimized) */ + const dt_type_desc_t* use_desc; /**< the datatype version used by the convertor (normal or optimized) */ uint32_t count; /**< the total number of full datatype elements */ char* pBaseBuf; /**< initial buffer as supplied by the user */ dt_stack_t* pStack; /**< the local stack for the actual conversion */ @@ -292,6 +292,8 @@ extern ompi_convertor_t* ompi_mpi_external32_convertor; /* and finally the convertor functions */ OMPI_DECLSPEC ompi_convertor_t* ompi_convertor_create( int32_t remote_arch, int32_t mode ); +OMPI_DECLSPEC int32_t ompi_convertor_set_start_position( ompi_convertor_t* convertor, + int32_t starting_pos ); OMPI_DECLSPEC int32_t ompi_convertor_init_for_send( ompi_convertor_t* pConv, uint32_t flags, const ompi_datatype_t* pData, int32_t count, const void* pUserBuf, int32_t local_starting_point, @@ -316,8 +318,8 @@ static inline int ompi_convertor_copy( const ompi_convertor_t* pSrcConv, ompi_co pDestConv->stack_size = DT_STATIC_STACK_SIZE; pDestConv->stack_pos = 0; pDestConv->pFunctions = pSrcConv->pFunctions; - - return OMPI_SUCCESS; + pDestConv->use_desc = pSrcConv->use_desc; + return OMPI_SUCCESS; } static inline ompi_convertor_t* ompi_convertor_get_copy( const ompi_convertor_t* pConvertor ) diff --git a/src/datatype/datatype_internal.h b/src/datatype/datatype_internal.h index 9a479f48bb..63a3829291 100644 --- a/src/datatype/datatype_internal.h +++ b/src/datatype/datatype_internal.h @@ -254,10 +254,25 @@ do { \ } while (0) #if OMPI_ENABLE_DEBUG -void ompi_ddt_safeguard_pointer( const void* actual_ptr, int length, const void* initial_ptr, - const ompi_datatype_t* pData, int count ); +OMPI_DECLSPEC int ompi_ddt_safeguard_pointer_debug_breakpoint( const void* actual_ptr, int length, + const void* initial_ptr, + const ompi_datatype_t* pData, + int count ); #define OMPI_DDT_SAFEGUARD_POINTER( ACTPTR, LENGTH, INITPTR, PDATA, COUNT ) \ - ompi_ddt_safeguard_pointer( (ACTPTR), (LENGTH), (INITPTR), (PDATA), (COUNT) ) +{ \ + char *__lower_bound = (char*)(INITPTR), *__upper_bound; \ + assert( ((LENGTH) != 0) && ((COUNT) != 0) ); \ + __lower_bound += (PDATA)->true_lb; \ + __upper_bound = (INITPTR) + ((PDATA)->ub - (PDATA)->lb) * ((COUNT) - 1) + (PDATA)->true_ub; \ + if( ((ACTPTR) < __lower_bound) || ((ACTPTR) >= __upper_bound) ) { \ + ompi_output( 0, "%s:%d\n\tPointer %p size %d is outside [%p,%p] for\n\tbase ptr %p count %d and data \n", \ + __FILE__, __LINE__, (ACTPTR), (LENGTH), __lower_bound, __upper_bound, \ + (INITPTR), (COUNT) ); \ + ompi_ddt_dump( (PDATA) ); \ + ompi_ddt_safeguard_pointer_debug_breakpoint( (ACTPTR), (LENGTH), (INITPTR), (PDATA), (COUNT) ); \ + } \ +} + #else #define OMPI_DDT_SAFEGUARD_POINTER( ACTPTR, LENGTH, INITPTR, PDATA, COUNT ) #endif /* OMPI_ENABLE_DEBUG */ @@ -322,13 +337,9 @@ int ompi_convertor_create_stack_with_pos_contig( ompi_convertor_t* pConvertor, pStack[0].count = pConvertor->count; pStack[0].index = -1; - if( pData->opt_desc.desc != NULL ) { - pElems = pData->opt_desc.desc; - pStack[0].end_loop = pData->opt_desc.used; - } else { - pElems = pData->desc.desc; - pStack[0].end_loop = pData->desc.used; - } + + pElems = pConvertor->use_desc->desc; + pStack[0].end_loop = pConvertor->use_desc->used; /* Special case for contiguous datatypes */ if( pData->size == 0 ) { /* special case for empty datatypes */ @@ -359,7 +370,6 @@ int ompi_convertor_create_stack_with_pos_contig( ompi_convertor_t* pConvertor, static inline int ompi_convertor_create_stack_at_begining( ompi_convertor_t* pConvertor, const int* sizes ) { - ompi_datatype_t* pData = pConvertor->pDesc; dt_stack_t* pStack; dt_elem_desc_t* pElems; int index = 0; @@ -375,14 +385,9 @@ int ompi_convertor_create_stack_at_begining( ompi_convertor_t* pConvertor, const pConvertor->pStack[0].disp = 0; /* first here we should select which data representation will be used for * this operation: normal one or the optimized version ? */ - pElems = pData->desc.desc; - pStack[0].end_loop = pData->desc.used; - if( pConvertor->flags & CONVERTOR_HOMOGENEOUS ) { - if( pData->opt_desc.used > 0 ) { - pElems = pData->opt_desc.desc; - pConvertor->pStack[0].end_loop = pData->opt_desc.used; - } - } + pElems = pConvertor->use_desc->desc; + pStack[0].end_loop = pConvertor->use_desc->used; + /* In the case where the datatype start with loops, we should push them on the stack. * Otherwise when we reach the end_loop field we will pop too many entries and finish * by overriding other places in memory. Now the big question is when to stop creating @@ -407,7 +412,7 @@ int ompi_convertor_create_stack_at_begining( ompi_convertor_t* pConvertor, const static inline void convertor_init_generic( ompi_convertor_t* pConv, const ompi_datatype_t* datatype, int count, - const void* pUserBuf ) + const void* pUserBuf ) { uint32_t required_stack_length = datatype->btypes[DT_LOOP] + 3; @@ -418,11 +423,17 @@ convertor_init_generic( ompi_convertor_t* pConv, const ompi_datatype_t* datatype if( pConv->stack_size > DT_STATIC_STACK_SIZE ) free( pConv->pStack ); } - pConv->pStack = pConv->static_stack; + pConv->pStack = pConv->static_stack; pConv->stack_size = DT_STATIC_STACK_SIZE; - pConv->bConverted = 0; + /* Decide which data representation will be used for the conversion. */ + if( (NULL != datatype->opt_desc.desc) && (pConv->flags & CONVERTOR_HOMOGENEOUS) ) { + pConv->use_desc = &(datatype->opt_desc); + } else { + pConv->use_desc = &(datatype->desc); + } + pConv->bConverted = 0; /* reset the convertor */ } - if( required_stack_length > pConv->stack_size ) { + if( DT_STATIC_STACK_SIZE < required_stack_length ) { pConv->stack_size = required_stack_length; pConv->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * pConv->stack_size ); } diff --git a/src/datatype/dt_optimize.c b/src/datatype/dt_optimize.c index 756d30c361..c97692ff65 100644 --- a/src/datatype/dt_optimize.c +++ b/src/datatype/dt_optimize.c @@ -25,38 +25,44 @@ #include int32_t ompi_ddt_optimize_short( ompi_datatype_t* pData, int32_t count, - dt_type_desc_t* pTypeDesc ) + dt_type_desc_t* pTypeDesc ) { dt_elem_desc_t* pElemDesc; - long lastDisp = 0; - dt_stack_t* pStack; /* pointer to the position on the stack */ - int32_t pos_desc; /* actual position in the description of the derived datatype */ - int32_t stack_pos = 0; - int32_t type, lastLength = 0, nbElems = 0, changes = 0, lastExtent = 1; - long totalDisp; + long last_disp = 0; + dt_stack_t* pStack; /* pointer to the position on the stack */ + int32_t pos_desc = 0; /* actual position in the description of the derived datatype */ + int32_t stack_pos = 0, last_type = DT_BYTE; + int32_t type = DT_BYTE, last_length = 0, nbElems = 0, changes = 0, last_extent = 1; + uint16_t last_flags = 0xFFFF; /* keep all for the first datatype */ + long total_disp; + int32_t optimized = 0; + + /* Contiguous datatypes does not have to get optimized */ + assert( (pData->flags & DT_FLAG_CONTIGUOUS) == 0 ); + + /* If there is no datatype description how can we have an optimized description ? */ + if( (count == 0) || (pData->desc.used == 0) ) { + pTypeDesc->length = 0; + pTypeDesc->desc = NULL; + pTypeDesc->used = 0; + return 1; + } + + pStack = alloca( sizeof(dt_stack_t) * (pData->btypes[DT_LOOP]+2) ); + total_disp = 0; pTypeDesc->length = 2 * pData->desc.used + 1 /* for the fake DT_END_LOOP at the end */; pTypeDesc->desc = pElemDesc = (dt_elem_desc_t*)malloc( sizeof(dt_elem_desc_t) * pTypeDesc->length ); pTypeDesc->used = 0; - if( (count == 0) || (pData->desc.used == 0) ) return 1; - - pStack = alloca( sizeof(dt_stack_t) * (pData->btypes[DT_LOOP]+2) ); - pStack->count = count; - pStack->index = -1; - pStack->end_loop = pData->desc.used; - pStack->disp = 0; - pos_desc = 0; - totalDisp = 0; - while( stack_pos >= 0 ) { if( DT_END_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */ ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop); - if( lastLength != 0 ) { - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, lastLength, lastDisp, lastExtent ); + if( last_length != 0 ) { + CREATE_ELEM( pElemDesc, last_type, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; - lastDisp += lastLength; - lastLength = 0; + last_disp += last_length; + last_length = 0; } CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1, /* # of elems in this loop */ end_loop->total_extent, end_loop->size, end_loop->common.flags ); @@ -64,7 +70,7 @@ int32_t ompi_ddt_optimize_short( ompi_datatype_t* pData, int32_t count, if( --stack_pos >= 0 ) { /* still something to do ? */ ddt_loop_desc_t* pStartLoop = &(pTypeDesc->desc[pStack->index - 1].loop); pStartLoop->items = (pElemDesc - 1)->elem.count; - totalDisp = pStack->disp; /* update the displacement position */ + total_disp = pStack->disp; /* update the displacement position */ } pStack--; /* go down one position on the stack */ pos_desc++; @@ -80,77 +86,93 @@ int32_t ompi_ddt_optimize_short( ompi_datatype_t* pData, int32_t count, /* the loop is contiguous or composed by contiguous elements with a gap */ if( loop->extent == (long)end_loop->size ) { /* the whole loop is contiguous */ - if( (lastDisp + lastLength) != (totalDisp + loop_disp) ) { - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, lastLength, lastDisp, lastExtent ); + if( (last_disp + last_length) != (total_disp + loop_disp) ) { + CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; - lastLength = 0; - lastDisp = totalDisp + loop_disp; + last_length = 0; + last_disp = total_disp + loop_disp; } - lastLength += loop->loops * end_loop->size; + last_length += loop->loops * end_loop->size; + optimized++; } else { int counter = loop->loops; /* if the previous data is contiguous with this piece and it has a length not ZERO */ - if( lastLength != 0 ) { - if( (lastDisp + lastLength) == (totalDisp + loop_disp) ) { - lastLength += end_loop->size; + if( last_length != 0 ) { + if( (last_disp + last_length) == (total_disp + loop_disp) ) { + last_length += end_loop->size; counter--; } - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, lastLength, lastDisp, lastExtent ); + CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; - lastDisp += lastLength; - lastLength = 0; + last_disp += last_length; + last_length = 0; } /* we have a gap in the begining or the end of the loop but the whole * loop can be merged in just one memcpy. */ CREATE_LOOP_START( pElemDesc, counter, (long)2, loop->extent, loop->common.flags ); pElemDesc++; nbElems++; - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, end_loop->size, loop_disp, lastExtent ); + CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, end_loop->size, loop_disp, last_extent ); pElemDesc++; nbElems++; CREATE_LOOP_END( pElemDesc, 2, end_loop->total_extent, end_loop->size, end_loop->common.flags ); pElemDesc++; nbElems++; + if( loop->items > 2 ) optimized++; } pos_desc += pData->desc.desc[pos_desc].loop.items + 1; changes++; } else { - if( lastLength != 0 ) { - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, lastLength, lastDisp, lastExtent ); + if( last_length != 0 ) { + CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; - lastDisp += lastLength; - lastLength = 0; + last_disp += last_length; + last_length = 0; } CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags ); pElemDesc++; nbElems++; - PUSH_STACK( pStack, stack_pos, nbElems, DT_LOOP, loop->loops, totalDisp, pos_desc + loop->extent ); + PUSH_STACK( pStack, stack_pos, nbElems, DT_LOOP, loop->loops, total_disp, pos_desc + loop->extent ); pos_desc++; DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" ); } - totalDisp = pStack->disp; /* update the displacement */ + total_disp = pStack->disp; /* update the displacement */ continue; } while( pData->desc.desc[pos_desc].elem.common.flags & DT_FLAG_DATA ) { /* keep doing it until we reach a non datatype element */ /* now here we have a basic datatype */ type = pData->desc.desc[pos_desc].elem.common.type; + if( (pData->desc.desc[pos_desc].elem.common.flags & DT_FLAG_CONTIGUOUS) && - (lastDisp + lastLength) == (totalDisp + pData->desc.desc[pos_desc].elem.disp) ) { - lastLength += pData->desc.desc[pos_desc].elem.count * ompi_ddt_basicDatatypes[type]->size; - lastExtent = 1; + (last_disp + last_length) == (total_disp + pData->desc.desc[pos_desc].elem.disp) && + (pData->desc.desc[pos_desc].elem.extent == (int32_t)ompi_ddt_basicDatatypes[type]->size) ) { + if( type == last_type ) { + last_length += pData->desc.desc[pos_desc].elem.count; + } else { + if( last_length == 0 ) { + last_type = type; + last_length = pData->desc.desc[pos_desc].elem.count; + } else { + last_length = last_length * ompi_ddt_basicDatatypes[last_type]->size + + pData->desc.desc[pos_desc].elem.count * ompi_ddt_basicDatatypes[type]->size; + last_type = DT_BYTE; + optimized++; + } + } + last_flags &= pData->desc.desc[pos_desc].elem.common.flags; } else { - if( lastLength != 0 ) { - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, lastLength, lastDisp, lastExtent ); + if( last_length != 0 ) { + CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; } - lastDisp = totalDisp + pData->desc.desc[pos_desc].elem.disp; - lastLength = pData->desc.desc[pos_desc].elem.count * ompi_ddt_basicDatatypes[type]->size; - lastExtent = 1; + last_disp = total_disp + pData->desc.desc[pos_desc].elem.disp; + last_length = pData->desc.desc[pos_desc].elem.count * ompi_ddt_basicDatatypes[type]->size; + last_extent = 1; } pos_desc++; /* advance to the next data */ } } - if( lastLength != 0 ) { - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, lastLength, lastDisp, lastExtent ); + if( last_length != 0 ) { + CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; } /* cleanup the stack */ @@ -173,7 +195,7 @@ static int ompi_ddt_unroll( ompi_datatype_t* pData, int count ) int type; /* type at current position */ int i; /* index for basic elements with extent */ int stack_pos = 0; /* position on the stack */ - long lastDisp = 0, lastLength = 0; + long last_disp = 0, last_length = 0; char* pDestBuf; int bConverted = 0, __index = 0, __sofar = 0; dt_elem_desc_t* pElems; @@ -239,20 +261,20 @@ static int ompi_ddt_unroll( ompi_datatype_t* pData, int count ) if( DT_LOOP == pElems[pos_desc].type ) { if( pElems[pos_desc].flags & DT_FLAG_CONTIGUOUS ) { dt_elem_desc_t* pLast = &( pElems[pos_desc + pElems[pos_desc].disp]); - if( (lastDisp + lastLength) == (pStack->disp + pElems[pos_desc+1].disp) ) { - PRINT_MEMCPY( pDestBuf, (char*)lastDisp, lastLength + pLast->extent ); - lastDisp = pStack->disp + pElems[pos_desc+1].disp + pLast->extent; + if( (last_disp + last_length) == (pStack->disp + pElems[pos_desc+1].disp) ) { + PRINT_MEMCPY( pDestBuf, (char*)last_disp, last_length + pLast->extent ); + last_disp = pStack->disp + pElems[pos_desc+1].disp + pLast->extent; i = 1; } else { - PRINT_MEMCPY( pDestBuf, (char*)lastDisp, lastLength ); - lastDisp = pStack->disp + pElems[pos_desc + 1].disp; + PRINT_MEMCPY( pDestBuf, (char*)last_disp, last_length ); + last_disp = pStack->disp + pElems[pos_desc + 1].disp; i = 0; } - lastLength = pLast->extent; + last_length = pLast->extent; for( ; i < (pElems[pos_desc].count - 1); i++ ) { - PRINT_MEMCPY( pDestBuf, (char*)lastDisp, lastLength ); + PRINT_MEMCPY( pDestBuf, (char*)last_disp, last_length ); pDestBuf += pLast->extent; - lastDisp += pElems[pos_desc].extent; + last_disp += pElems[pos_desc].extent; } pos_desc += pElems[pos_desc].disp + 1; goto next_loop; @@ -266,18 +288,18 @@ static int ompi_ddt_unroll( ompi_datatype_t* pData, int count ) } /* now here we have a basic datatype */ type = pElems[pos_desc].type; - if( (lastDisp + lastLength) == (pStack->disp + pElems[pos_desc].disp) ) { - lastLength += pElems[pos_desc].count * ompi_ddt_basicDatatypes[type]->size; + if( (last_disp + last_length) == (pStack->disp + pElems[pos_desc].disp) ) { + last_length += pElems[pos_desc].count * ompi_ddt_basicDatatypes[type]->size; } else { - PRINT_MEMCPY( pDestBuf, (char*)lastDisp, lastLength ); - pDestBuf += lastLength; - bConverted += lastLength; - lastDisp = pStack->disp + pElems[pos_desc].disp; - lastLength = pElems[pos_desc].count * ompi_ddt_basicDatatypes[type]->size; + PRINT_MEMCPY( pDestBuf, (char*)last_disp, last_length ); + pDestBuf += last_length; + bConverted += last_length; + last_disp = pStack->disp + pElems[pos_desc].disp; + last_length = pElems[pos_desc].count * ompi_ddt_basicDatatypes[type]->size; } pos_desc++; /* advance to the next data */ } - PRINT_MEMCPY( pDestBuf, (char*)lastDisp, lastLength ); + PRINT_MEMCPY( pDestBuf, (char*)last_disp, last_length ); return OMPI_SUCCESS; } #endif /* COMPILE_USELSS_CODE */ @@ -302,15 +324,17 @@ int32_t ompi_ddt_commit( ompi_datatype_t** data ) /* If the data is contiguous is useless to generate an optimized version. */ if( (long)pData->size != (pData->true_ub - pData->true_lb) ) { (void)ompi_ddt_optimize_short( pData, 1, &(pData->opt_desc) ); - /* let's add a fake element at the end just to avoid useless comparaisons - * in pack/unpack functions. - */ - pLast = &(pData->opt_desc.desc[pData->opt_desc.used].end_loop); - pLast->common.type = DT_END_LOOP; - pLast->common.flags = 0; - pLast->items = pData->opt_desc.used; - pLast->total_extent = pData->ub - pData->lb; - pLast->size = pData->size; + if( 0 < pData->opt_desc.used ) { + /* let's add a fake element at the end just to avoid useless comparaisons + * in pack/unpack functions. + */ + pLast = &(pData->opt_desc.desc[pData->opt_desc.used].end_loop); + pLast->common.type = DT_END_LOOP; + pLast->common.flags = 0; + pLast->items = pData->opt_desc.used; + pLast->total_extent = pData->ub - pData->lb; + pLast->size = pData->size; + } } return OMPI_SUCCESS; } diff --git a/src/datatype/dt_pack.c b/src/datatype/dt_pack.c index 9245a23ff9..5b06660513 100644 --- a/src/datatype/dt_pack.c +++ b/src/datatype/dt_pack.c @@ -27,26 +27,15 @@ #define DO_DEBUG(INST) -void ompi_ddt_safeguard_pointer( const void* actual_ptr, int length, - const void* initial_ptr, - const ompi_datatype_t* pData, - int count ) +#if OMPI_ENABLE_DEBUG +int ompi_ddt_safeguard_pointer_debug_breakpoint( const void* actual_ptr, int length, + const void* initial_ptr, + const ompi_datatype_t* pData, + int count ) { - char* lower_bound = (char*)initial_ptr; - char* upper_bound = (char*)initial_ptr; - - if( (length == 0) || (count == 0) ) return; - lower_bound += pData->lb; - upper_bound += (pData->ub - pData->lb) * (count - 1) + pData->true_ub; - - if( (char*)actual_ptr >= lower_bound ) - /* Im up from the lower bound */ - if( ((char*)actual_ptr + length) <= upper_bound ) - return; - ompi_output( 0, "Pointer %p size %d is outside [%p,%p] for %d times the data \n", - actual_ptr, length, lower_bound, upper_bound, count ); - ompi_ddt_dump( pData ); + return 0; } +#endif /* OMPI_ENABLE_DEBUG */ static int ompi_convertor_pack_general( ompi_convertor_t* pConvertor, @@ -195,12 +184,8 @@ int ompi_convertor_pack_homogeneous_with_memcpy( ompi_convertor_t* pConv, dt_elem_desc_t* pElems; pDestBuf = iov[0].iov_base; - - if( pData->opt_desc.desc != NULL ) { - pElems = pData->opt_desc.desc; - } else { - pElems = pData->desc.desc; - } + + pElems = pConv->use_desc->desc; pStack = pConv->pStack + pConv->stack_pos; pos_desc = pStack->index; @@ -329,27 +314,26 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, uint32_t iov_pos = 0; /* index in the iovec where we put data inside */ int bConverted = 0; /* number of bytes converted/moved this time */ uint32_t space_on_iovec; /* amount of free space on the current iovec */ - long lastDisp = 0, last_count = 0; + long lastDisp = 0; uint32_t space = *max_data, last_blength = 0, saveLength; - char *pDestBuf, *savePos; + char *destination, *source; ompi_datatype_t* pData = pConv->pDesc; + ddt_elem_desc_t pack_elem; dt_elem_desc_t* pElems; - if( pData->opt_desc.desc != NULL ) { - pElems = pData->opt_desc.desc; - } else { - pElems = pData->desc.desc; - } + pElems = pConv->use_desc->desc; - pDestBuf = iov[0].iov_base; pStack = pConv->pStack + pConv->stack_pos; + destination = iov[0].iov_base; + source = (char*)pConv->pBaseBuf + pStack->disp; + /* retrieve the context of the last call */ pos_desc = pStack->index; - last_count = pStack->count; - last_blength = last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; + pack_elem.count = pStack->count; + pack_elem.common.type = pElems[pos_desc].elem.common.type; + last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; lastDisp = pStack->disp; - savePos = (char*)pConv->pBaseBuf + pStack->disp; DO_DEBUG( ompi_output( 0, "pack_no_conversion stack_pos %d index %d count %d last_blength %ld lastDisp %ld savePos %p bConverted %d\n", pConv->stack_pos, pStack->index, pStack->count, last_blength, lastDisp, savePos, pConv->bConverted ); ); @@ -357,7 +341,7 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, pStack--; pConv->stack_pos--; - *freeAfter = 0; + *freeAfter = (*freeAfter) & ~((1 << (*out_size)) - 1); space_on_iovec = iov[0].iov_len; while( pos_desc >= 0 ) { @@ -372,7 +356,7 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, if( iov_pos < (*out_size) ) { /* still some place in the iovec */ if( iov[iov_pos].iov_base == NULL ) { /* prepare a new iovec */ - iov[iov_pos].iov_base = savePos; + iov[iov_pos].iov_base = source; iov[iov_pos].iov_len = saveLength; bConverted += saveLength; saveLength = 0; @@ -384,13 +368,13 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, if( space_on_iovec < saveLength ) { copy_length = space_on_iovec; } - OMPI_DDT_SAFEGUARD_POINTER( savePos, copy_length, + OMPI_DDT_SAFEGUARD_POINTER( source, copy_length, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( ompi_output( 0, "1. memcpy( %p, %p, %d ) bConverted %d space %d pConv->bConverted %d\n", pDestBuf, savePos, + DO_DEBUG( ompi_output( 0, "1. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source, copy_length, bConverted, space_on_iovec, pConv->bConverted ); ); - MEMCPY( pDestBuf, savePos, copy_length ); - savePos += copy_length; - pDestBuf += copy_length; + MEMCPY( destination, source, copy_length ); + source += copy_length; + destination += copy_length; bConverted += copy_length; space_on_iovec -= copy_length; saveLength -= copy_length; @@ -398,7 +382,7 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, } } iov[iov_pos].iov_len -= space_on_iovec; - last_count = 0; + pack_elem.count = 0; pos_desc = -1; last_blength = 0; goto end_loop; @@ -416,8 +400,8 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, } pos_desc++; /* go to the next element */ lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - last_count = pElems[pos_desc].elem.count; - last_blength = last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; + pack_elem.count = pElems[pos_desc].elem.count; + last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; continue; /* next loop */ } while( DT_LOOP == pElems[pos_desc].elem.common.type ) { @@ -432,62 +416,63 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, if( iov[iov_pos].iov_base == NULL ) { iov[iov_pos].iov_base = pConv->memAlloc_fn( &(iov[iov_pos].iov_len) ); space_on_iovec = iov[iov_pos].iov_len; - pDestBuf = iov[iov_pos].iov_base; + destination = iov[iov_pos].iov_base; (*freeAfter) |= (1 << iov_pos); } /* compute the maximum amount of data to be packed */ - if( (end_loop->size * last_count) > space_on_iovec ) { - stop_in_loop = last_count; - last_count = space_on_iovec / end_loop->size; + if( (end_loop->size * pack_elem.count) > space_on_iovec ) { + stop_in_loop = pack_elem.count; + pack_elem.count = space_on_iovec / end_loop->size; } /* Now let's do it */ - for( i = 0; i < last_count; i++ ) { + for( i = 0; i < (int)pack_elem.count; i++ ) { OMPI_DDT_SAFEGUARD_POINTER( pConv->pBaseBuf + lastDisp, end_loop->size, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG (ompi_output( 0, "2. memcpy( %p, %p, %d )\n", pDestBuf, pConv->pBaseBuf + lastDisp, + DO_DEBUG (ompi_output( 0, "2. memcpy( %p, %p, %ld )\n", destination, pConv->pBaseBuf + lastDisp, end_loop->size ); ); - MEMCPY( pDestBuf, pConv->pBaseBuf + lastDisp, end_loop->size ); + MEMCPY( destination, pConv->pBaseBuf + lastDisp, end_loop->size ); lastDisp += pElems[pos_desc].loop.extent; - pDestBuf += end_loop->size; + destination += end_loop->size; } DO_DEBUG( ompi_output( 0, "\t\tbConverted %d space %d pConv->bConverted %d\n", bConverted, space_on_iovec, pConv->bConverted ); ); - i = end_loop->size * last_count; /* temporary value */ + i = end_loop->size * pack_elem.count; /* temporary value */ space_on_iovec -= i; space -= i; bConverted += i; if( stop_in_loop == 0 ) { /* did I stop before the end */ /* the pElems point to the LOOP struct in the begining */ pos_desc += pElems[pos_desc].loop.items + 1; - last_count = pElems[pos_desc].elem.count; - last_blength = last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; + pack_elem.count = pElems[pos_desc].elem.count; + last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; lastDisp = pStack->disp + pElems[pos_desc].elem.disp; continue; } /* mark some of the iterations as completed */ - last_count = stop_in_loop - last_count; + pack_elem.count = stop_in_loop - pack_elem.count; last_blength = 0; - /* Save the stack with the correct last_count value. */ + /* Save the stack with the correct count value. */ } - PUSH_STACK( pStack, pConv->stack_pos, pos_desc, DT_LOOP, last_count, + PUSH_STACK( pStack, pConv->stack_pos, pos_desc, DT_LOOP, pack_elem.count, pStack->disp, pos_desc + pElems[pos_desc].loop.items ); pos_desc++; lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - last_count = pElems[pos_desc].elem.count; - last_blength = last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; + pack_elem.count = pElems[pos_desc].elem.count; + last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; } /* now here we have a basic datatype */ while( pElems[pos_desc].elem.common.flags & DT_FLAG_DATA ) { /* first let's see if it's contiguous with the previous chunk of memory and * we still have enough room in the buffer... */ - if( ((savePos + saveLength) == (pConv->pBaseBuf + lastDisp)) - && ((saveLength + last_blength) <= space_on_iovec) ) { + if( ((source + saveLength) == (pConv->pBaseBuf + lastDisp)) + && ((saveLength + last_blength) <= space_on_iovec) + && (pElems[pos_desc].elem.extent == (long)BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size) ) { /* ok still contiguous and we still have some space on the buffer */ saveLength += last_blength; /* nothing else to do, we act the next time */ } else { - /* Now we have 2 piece of non contiguous memory. One start at savePos + /* Now we have 2 piece of non contiguous memory. One start at source * with a length of saveLength, the other start at * pConv->pBaseBuf + lastDisp with a length of last_blength bytes. * First we have to pack the old buffer and then we should decide @@ -499,54 +484,54 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, /* If the user didn't provide any memory, then we are free * to handle this case as we want. */ - iov[iov_pos].iov_base = savePos; + iov[iov_pos].iov_base = source; iov[iov_pos].iov_len = saveLength; - savePos = pConv->pBaseBuf + lastDisp; + source = pConv->pBaseBuf + lastDisp; /* update the pack counters values */ bConverted += saveLength; space -= saveLength; saveLength = last_blength; last_blength = 0; if( ++iov_pos == (*out_size) ) goto end_loop; - pDestBuf = iov[iov_pos].iov_base; + destination = iov[iov_pos].iov_base; space_on_iovec = iov[iov_pos].iov_len; break; } /* Let's allocate some. */ iov[iov_pos].iov_base = pConv->memAlloc_fn( &(iov[iov_pos].iov_len) ); (*freeAfter) |= (1 << iov_pos); - pDestBuf = iov[iov_pos].iov_base; + destination = iov[iov_pos].iov_base; space_on_iovec = iov[iov_pos].iov_len; } /* In all the others cases we simply copy as much data as possible */ if( space_on_iovec > saveLength ) { - OMPI_DDT_SAFEGUARD_POINTER( savePos, saveLength, + OMPI_DDT_SAFEGUARD_POINTER( source, saveLength, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( ompi_output( 0, "3. memcpy( %p, %p, %d ) bConverted %d space %d pConv->bConverted %d\n", pDestBuf, savePos, + DO_DEBUG( ompi_output( 0, "3. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source, saveLength, bConverted, space_on_iovec, pConv->bConverted ); ); - MEMCPY( pDestBuf, savePos, saveLength ); - pDestBuf += saveLength; + MEMCPY( destination, source, saveLength ); + destination += saveLength; /* update the pack counters values */ bConverted += saveLength; space -= saveLength; space_on_iovec -= saveLength; - savePos = pConv->pBaseBuf + lastDisp; + source = pConv->pBaseBuf + lastDisp; saveLength = last_blength; last_blength = 0; break; } - OMPI_DDT_SAFEGUARD_POINTER( savePos, space_on_iovec, + OMPI_DDT_SAFEGUARD_POINTER( source, space_on_iovec, pConv->pBaseBuf, pData, pConv->count ); - DO_DEBUG( ompi_output( 0, "4. memcpy( %p, %p, %d ) bConverted %d space %d pConv->bConverted %d\n", pDestBuf, savePos, + DO_DEBUG( ompi_output( 0, "4. memcpy( %p, %p, %ld ) bConverted %ld space %ld pConv->bConverted %ld\n", destination, source, space_on_iovec, bConverted, space_on_iovec, pConv->bConverted ); ); - MEMCPY( pDestBuf, savePos, space_on_iovec ); + MEMCPY( destination, source, space_on_iovec ); /* let's prepare for the next round. As I keep trace of the amount that I still * have to pack, the next time when I came here, I'll try to append something. * If I already fill-up the amount of data required by the upper level, I will * simply save all informations in the stack, if not I'll take care of allocating * new memory and packing the data inside. */ - savePos += space_on_iovec; + source += space_on_iovec; saveLength -= space_on_iovec; /* update the pack counters values */ bConverted += space_on_iovec; @@ -561,7 +546,7 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, } goto end_loop; } - pDestBuf = iov[iov_pos].iov_base; + destination = iov[iov_pos].iov_base; space_on_iovec = iov[iov_pos].iov_len; } while(1); /* continue forever */ } @@ -570,8 +555,8 @@ int ompi_convertor_pack_no_conversion( ompi_convertor_t* pConv, continue; pos_desc++; /* advance to the next data */ lastDisp = pStack->disp + pElems[pos_desc].elem.disp; - last_count = pElems[pos_desc].elem.count; - last_blength = last_count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; + pack_elem.count = pElems[pos_desc].elem.count; + last_blength = pack_elem.count * BASIC_DDT_FROM_ELEM(pElems[pos_desc])->size; } } end_loop: @@ -637,7 +622,7 @@ ompi_convertor_pack_no_conv_contig( ompi_convertor_t* pConv, /* update the return value */ *max_data = pConv->bConverted - initial_amount; *out_size = iov_count; - return (pConv->bConverted == (pData->size * pConv->count)); + return (0 == length); } static int @@ -763,7 +748,22 @@ ompi_convertor_pack_no_conv_contig_with_gaps( ompi_convertor_t* pConv, return (pConv->bConverted == length); } +/* + * Set the starting position for a convertor. This function can be used at any + * moment in the life of a convertor to move the position to the desired point. + */ extern int ompi_ddt_local_sizes[DT_MAX_PREDEFINED]; +inline int32_t ompi_convertor_set_start_position( ompi_convertor_t* convertor, + int32_t starting_pos ) +{ + if( convertor->flags & DT_FLAG_CONTIGUOUS ) + return ompi_convertor_create_stack_with_pos_contig( convertor, starting_pos, ompi_ddt_local_sizes ); + if( starting_pos != 0 ) { + return ompi_convertor_create_stack_with_pos_general( convertor, starting_pos, ompi_ddt_local_sizes ); + } + return ompi_convertor_create_stack_at_begining( convertor, ompi_ddt_local_sizes ); +} + int32_t ompi_convertor_init_for_send( ompi_convertor_t* pConv, uint32_t flags, const ompi_datatype_t* datatype, @@ -776,29 +776,37 @@ int32_t ompi_convertor_init_for_send( ompi_convertor_t* pConv, /* this datatype is improper for conversion. Commit it first */ return OMPI_ERROR; } - assert( datatype != NULL ); + pConv->flags = CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS; /* by default set to homogeneous */ convertor_init_generic( pConv, datatype, count, pUserBuf ); - pConv->flags = CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS; /* by default set to homogeneous */ pConv->pFunctions = ompi_ddt_copy_functions; pConv->memAlloc_fn = allocfn; /* Just to avoid complaint from the compiler */ pConv->fAdvance = ompi_convertor_pack_general; pConv->fAdvance = ompi_convertor_pack_homogeneous_with_memcpy; + pConv->fAdvance = ompi_convertor_pack_no_conversion; + if( datatype->flags & DT_FLAG_CONTIGUOUS ) { pConv->flags |= DT_FLAG_CONTIGUOUS; - if( ((datatype->ub - datatype->lb) != (long)datatype->size) && (1 < pConv->count) ) /* gaps or no gaps */ - pConv->fAdvance = ompi_convertor_pack_no_conv_contig_with_gaps; - else + if( ((datatype->ub - datatype->lb) == (long)datatype->size) || (1 >= pConv->count) ) /* gaps or no gaps */ pConv->fAdvance = ompi_convertor_pack_no_conv_contig; - return ompi_convertor_create_stack_with_pos_contig( pConv, starting_pos, ompi_ddt_local_sizes ); + else + pConv->fAdvance = ompi_convertor_pack_no_conv_contig_with_gaps; } - pConv->fAdvance = ompi_convertor_pack_no_conversion; + if( -1 == starting_pos ) return OMPI_SUCCESS; - if( starting_pos != 0 ) { - return ompi_convertor_create_stack_with_pos_general( pConv, starting_pos, ompi_ddt_local_sizes ); + + /* dont call any function if the convertor is in the correct position */ + if( (pConv->bConverted == (unsigned long)starting_pos) && + (0 != starting_pos) ) return OMPI_SUCCESS; + + /* do we start after the end of the data ? */ + if( starting_pos >= (int)(pConv->count * datatype->size) ) { + pConv->bConverted = pConv->count * datatype->size; + return OMPI_SUCCESS; } - return ompi_convertor_create_stack_at_begining( pConv, ompi_ddt_local_sizes ); + + return ompi_convertor_set_start_position( pConv, starting_pos ); } #if OMPI_ENABLE_DEBUG @@ -833,6 +841,7 @@ ompi_convertor_t* ompi_convertor_create( int32_t remote_arch, int32_t mode ) static void ompi_convertor_construct( ompi_convertor_t* pConv ) { pConv->pDesc = NULL; + pConv->use_desc = NULL; pConv->pStack = pConv->static_stack; pConv->stack_size = DT_STATIC_STACK_SIZE; pConv->fAdvance = NULL; diff --git a/src/datatype/dt_unpack.c b/src/datatype/dt_unpack.c index 0bc07b2c81..41280f3cb8 100644 --- a/src/datatype/dt_unpack.c +++ b/src/datatype/dt_unpack.c @@ -183,11 +183,7 @@ static int ompi_convertor_unpack_homogeneous( ompi_convertor_t* pConv, pSrcBuf = iov[0].iov_base; - if( pData->opt_desc.desc != NULL ) { - pElems = pData->opt_desc.desc; - } else { - pElems = pData->desc.desc; - } + pElems = pConv->use_desc->desc; pStack = pConv->pStack + pConv->stack_pos; pos_desc = pStack->index; lastDisp = pStack->disp; @@ -630,31 +626,38 @@ int32_t ompi_convertor_need_buffers( ompi_convertor_t* pConvertor ) extern int ompi_ddt_local_sizes[DT_MAX_PREDEFINED]; int32_t ompi_convertor_init_for_recv( ompi_convertor_t* pConv, uint32_t flags, const ompi_datatype_t* datatype, int32_t count, - const void* pUserBuf, int32_t starting_point, + const void* pUserBuf, int32_t starting_pos, memalloc_fct_t allocfn ) { if( !(datatype->flags & DT_FLAG_COMMITED) ) { /* this datatype is improper for conversion. Commit it first */ return OMPI_ERROR; } - assert( datatype != NULL ); - convertor_init_generic( pConv, datatype, count, pUserBuf ); - pConv->flags = CONVERTOR_RECV | CONVERTOR_HOMOGENEOUS; + convertor_init_generic( pConv, datatype, count, pUserBuf ); pConv->pFunctions = ompi_ddt_copy_functions; + pConv->memAlloc_fn = allocfn; pConv->fAdvance = ompi_convertor_unpack_general; /* TODO: just stop complaining */ pConv->fAdvance = ompi_convertor_unpack_homogeneous; /* default behaviour */ - pConv->memAlloc_fn = allocfn; /* TODO: work only on homogeneous architectures */ if( datatype->flags & DT_FLAG_CONTIGUOUS ) { pConv->flags |= DT_FLAG_CONTIGUOUS; pConv->fAdvance = ompi_convertor_unpack_homogeneous_contig; } - if( -1 == starting_point ) return OMPI_SUCCESS; - if( starting_point != 0 ) - return ompi_convertor_create_stack_with_pos_general( pConv, starting_point, ompi_ddt_local_sizes ); - return ompi_convertor_create_stack_at_begining( pConv, ompi_ddt_local_sizes ); + + if( -1 == starting_pos ) return OMPI_SUCCESS; + + /* dont call any function if the convertor is in the correct position */ + if( (pConv->bConverted == (unsigned long)starting_pos) && + (0 != starting_pos) ) return OMPI_SUCCESS; + + if( starting_pos >= (int)(pConv->count * datatype->size) ) { + pConv->bConverted = pConv->count * datatype->size; + return OMPI_SUCCESS; + } + + return ompi_convertor_set_start_position( pConv, starting_pos ); } #if OMPI_ENABLE_DEBUG @@ -705,6 +708,7 @@ int32_t ompi_ddt_get_element_count( const ompi_datatype_t* datatype, int32_t iSi uint32_t pos_desc; /* actual position in the description of the derived datatype */ int rc, nbElems = 0; int stack_pos = 0; + dt_elem_desc_t* pElems; /* Normally the size should be less or equal to the size of the datatype. * This function does not support a iSize bigger than the size of the datatype. @@ -712,14 +716,15 @@ int32_t ompi_ddt_get_element_count( const ompi_datatype_t* datatype, int32_t iSi assert( (uint32_t)iSize <= datatype->size ); DUMP( "dt_count_elements( %p, %d )\n", (void*)datatype, iSize ); pStack = alloca( sizeof(dt_stack_t) * (datatype->btypes[DT_LOOP] + 2) ); - pStack->count = 1; - pStack->index = -1; + pStack->count = 1; + pStack->index = -1; + pStack->disp = 0; + pElems = datatype->desc.desc; pStack->end_loop = datatype->desc.used; - pStack->disp = 0; - pos_desc = 0; + pos_desc = 0; while( 1 ) { /* loop forever the exit conditionis on the last section */ - if( DT_END_LOOP == datatype->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */ + if( DT_END_LOOP == pElems[pos_desc].elem.common.type ) { /* end of the current loop */ if( --(pStack->count) == 0 ) { /* end of loop */ stack_pos--; pStack--; @@ -729,33 +734,33 @@ int32_t ompi_ddt_get_element_count( const ompi_datatype_t* datatype, int32_t iSi if( pStack->index == -1 ) { pStack->disp += (datatype->ub - datatype->lb); } else { - assert( DT_LOOP == datatype->desc.desc[pStack->index].elem.common.type ); - pStack->disp += datatype->desc.desc[pStack->index].loop.extent; + assert( DT_LOOP == pElems[pStack->index].elem.common.type ); + pStack->disp += pElems[pStack->index].loop.extent; } pos_desc = pStack->index + 1; continue; } - if( DT_LOOP == datatype->desc.desc[pos_desc].elem.common.type ) { - ddt_loop_desc_t* loop = &(datatype->desc.desc[pos_desc].loop); + if( DT_LOOP == pElems[pos_desc].elem.common.type ) { + ddt_loop_desc_t* loop = &(pElems[pos_desc].loop); do { PUSH_STACK( pStack, stack_pos, pos_desc, DT_LOOP, loop->loops, 0, pos_desc + loop->items ); pos_desc++; - } while( DT_LOOP == datatype->desc.desc[pos_desc].elem.common.type ); /* let's start another loop */ - DDT_DUMP_STACK( pStack, stack_pos, datatype->desc.desc, "advance loops" ); + } while( DT_LOOP == pElems[pos_desc].elem.common.type ); /* let's start another loop */ + DDT_DUMP_STACK( pStack, stack_pos, pElems, "advance loops" ); continue; } - while( datatype->desc.desc[pos_desc].elem.common.flags & DT_FLAG_DATA ) { + while( pElems[pos_desc].elem.common.flags & DT_FLAG_DATA ) { /* now here we have a basic datatype */ - const ompi_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(datatype->desc.desc[pos_desc]); - rc = datatype->desc.desc[pos_desc].elem.count * basic_type->size; + const ompi_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]); + rc = pElems[pos_desc].elem.count * basic_type->size; if( rc >= iSize ) { rc = iSize / basic_type->size; nbElems += rc; iSize -= rc * basic_type->size; return (iSize == 0 ? nbElems : -1); } - nbElems += datatype->desc.desc[pos_desc].elem.count; + nbElems += pElems[pos_desc].elem.count; iSize -= rc; pos_desc++; /* advance to the next data */ } diff --git a/src/datatype/fake_stack.c b/src/datatype/fake_stack.c index 500d493294..3e07a4aa54 100644 --- a/src/datatype/fake_stack.c +++ b/src/datatype/fake_stack.c @@ -53,17 +53,10 @@ int ompi_convertor_create_stack_with_pos_general( ompi_convertor_t* pConvertor, size_t remote_size; uint32_t count; - if( starting_point == 0 ) { - return ompi_convertor_create_stack_at_begining( pConvertor, sizes ); - } - /* if the convertor continue from the last position there is nothing to do. */ - if( pConvertor->bConverted == (unsigned long)starting_point ) return OMPI_SUCCESS; + assert( 0 != starting_point ); + assert( pConvertor->bConverted != (unsigned long)starting_point ); + assert( starting_point <= (int)(pConvertor->count * pData->size) ); - /* do we start after the end of the data ? */ - if( starting_point >= (int)(pConvertor->count * pData->size) ) { - pConvertor->bConverted = pConvertor->count * pData->size; - return OMPI_SUCCESS; - } /*ompi_output( 0, "Data extent %d size %d count %d total_size %d starting_point %d\n", pData->ub - pData->lb, pData->size, pConvertor->count, pData->size * pConvertor->count, starting_point );*/ @@ -73,13 +66,8 @@ int ompi_convertor_create_stack_with_pos_general( ompi_convertor_t* pConvertor, * last fake DT_END_LOOP that we add to the data representation and * allow us to move quickly inside the datatype when we have a count. */ - if( pData->opt_desc.desc != NULL ) { - pElems = pData->opt_desc.desc; - pStack[0].end_loop = pData->opt_desc.used; - } else { - pElems = pData->desc.desc; - pStack[0].end_loop = pData->desc.used; - } + pElems = pConvertor->use_desc->desc; + pStack->end_loop = pConvertor->use_desc->used; if( (pConvertor->flags & CONVERTOR_HOMOGENEOUS) && (pData->flags & DT_FLAG_CONTIGUOUS) ) { /* Special case for contiguous datatypes */ diff --git a/src/datatype/new_pack.c b/src/datatype/new_pack.c new file mode 100644 index 0000000000..21adc1268b --- /dev/null +++ b/src/datatype/new_pack.c @@ -0,0 +1,233 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "datatype/datatype.h" +#include "datatype/datatype_internal.h" + +#ifdef HAVE_ALLOCA_H +#include +#endif +#include + +/* The pack/unpack functions need a cleanup. I have to create a proper interface to access + * all basic functionalities, hence using them as basic blocks for all conversion functions. + * + * But first let's make some global assumptions: + * - a datatype (with the flag DT_DATA set) will have the contiguous flags set if and only if + * the data is really contiguous (extent equal with size) + * - for the DT_LOOP type the DT_CONTIGUOUS flag set means that the content of the loop is contiguous + * but with a gap in the begining or at the end. + * - the DT_CONTIGUOUS flag for the type DT_END_LOOP is meaningless. + */ + +#define PACK_PREDEFINED_DATATYPE( TYPE, /* the basic type to be packed */ \ + COUNT, /* the number of elements */ \ + EXTENT, /* the extent in bytes of each element */ \ + SOURCE, /* the source pointer (char*) */ \ + DESTINATION, /* the destination pointer (char*) */ \ + SPACE ) /* the space in the destination buffer */ \ +do { \ + int copy_count = (COUNT), copy_blength; \ + \ + if( (copy_count * ompi_ddt_basicDatatypes[(TYPE)]->size) > (SPACE) ) \ + copy_count = (SPACE) / ompi_ddt_basicDatatypes[type]->size; \ + copy_blength = copy_count * ompi_ddt_basicDatatypes[type]->size; \ + \ + if( ompi_ddt_basicDatatypes[type]->size == (EXTENT) ) { \ + memcpy( (DESTINATION), (SOURCE), copy_blength ); \ + (SOURCE) += copy_blength; \ + (DESTINATION) += copy_blength; \ + } else { \ + int i; \ + for( i = 0; i < copy_count; i++ ) { \ + memcpy( (DESTINATION), (SOURCE), ompi_ddt_basicDatatypes[type]->size ); \ + (DESTINATION) += ompi_ddt_basicDatatypes[type]->size; \ + (SOURCE) += (EXTENT); \ + } \ + } \ + (SPACE) -= copy_blength; \ + (COUNT) -= copy_count; \ +} while (0) + +#define PACK_CONTIGUOUS_LOOP( CONVERTOR, /* */ \ + ELEM, /* */ \ + COUNT, /* */ \ + SOURCE, /* */ \ + DESTINATION, /* */ \ + SPACE ) /* */ \ +do { \ + ddt_loop_desc_t *loop = (ddt_loop_desc_t*)(ELEM); \ + ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)((ELEM) + (ELEM).loop.items); \ + size_t copy_loops = (COUNT); \ + int i; \ +\ + if( (copy_loops * end_loop->size) > (SPACE) ) \ + copy_loops = (SPACE) / end_loop->size; \ + assert( loop->extent != end_loop->size ); \ + for( i = 0; i < copy_loops; i++ ) { \ + OMPI_DDT_SAFEGUARD_POINTER( (CONVERTOR)->pBaseBuf + lastDisp, end_loop->size, \ + (CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count ); \ + DO_DEBUG (ompi_output( 0, "2. memcpy( %p, %p, %ld )\n", pDestBuf, (CONVERTOR)->pBaseBuf + lastDisp, \ + end_loop->size ); ); \ + MEMCPY( pDestBuf, (CONVERTOR)->pBaseBuf + lastDisp, end_loop->size ); \ + lastDisp += loop->extent; \ + (DESTINATION) += end_loop->size; \ + (SOURCE) += loop->extent; \ + } \ + (SPACE) -= copy_count * end_loop->size; \ +} while (0) + +#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER, DISPLACEMENT ) \ +do { \ + (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + (COUNTER) = (ELEMENT)->elem.count; \ + (DISPLACEMENT) = (ELEMENT)->elem.disp; \ +} while (0) + +int ompi_convertor_generic_simple_pack( ompi_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + uint32_t* max_data, + int32_t* freeAfter ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + uint16_t type; /* type at current position */ + long disp_desc = 0; /* compute displacement for truncated data */ + uint32_t bConverted = 0; /* number of bytes converted this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + ompi_datatype_t *pData = pConvertor->pDesc; + char* iov_base_local; + uint32_t iov_len_local, i, iov_count; + + DUMP( "ompi_convertor_generic_simple_pack( %p, {%p, %d}, %d )\n", (void*)pConvertor, + iov[0].iov_base, iov[0].iov_len, *out_size ); + + if( pConvertor->pDesc->opt_desc.used != 0 ) { + description = pConvertor->pDesc->opt_desc.desc; + } else { + description = pConvertor->pDesc->desc.desc; + } + /*description = pConvertor->use_desc->desc;*/ + + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + disp_desc = pStack->disp; + count_desc = pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem =&(description[pos_desc]); + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + if( iov[iov_count].iov_base == NULL ) { + /* + * ALLOCATE SOME MEMORY ... + */ + *freeAfter = (*freeAfter) | (1 << iov_count); + } + iov_base_local = iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + bConverted = 0; + while( 1 ) { + if( DT_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + if( --(pStack->count) == 0 ) { /* end of loop */ + if( pConvertor->stack_pos == 0 ) + goto complete_loop; /* completed */ + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( DT_LOOP == description[pStack->index].elem.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc, disp_desc ); + } + if( DT_LOOP == pElem->elem.common.type ) { + int stop_in_loop = 0; + if( pElem->loop.common.flags & DT_FLAG_CONTIGUOUS ) { + ddt_endloop_desc_t* end_loop = &(description[pos_desc + pElem->loop.items].end_loop); + if( (end_loop->size * count_desc) > iov_len_local ) { + stop_in_loop = count_desc; + count_desc = iov_len_local / end_loop->size; + } + for( i = 0; i < count_desc; i++ ) { + /* + * DO SOMETHING USEFULL ... + */ + iov_base_local += end_loop->size; /* size of the contiguous data */ + disp_desc += pElem->loop.extent; + } + iov_len_local -= (end_loop->size * count_desc); + bConverted += (end_loop->size * count_desc); + if( stop_in_loop == 0 ) { + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* mark some of the iterations as completed */ + count_desc = stop_in_loop - count_desc; + /* Save the stack with the correct last_count value. */ + } + PUSH_STACK( pStack, pConvertor->stack_pos, + pos_desc, DT_LOOP, count_desc, + pStack->disp, pos_desc + pElem->elem.disp + 1); + pos_desc++; +update_loop_description: + /* update the current state */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc, disp_desc ); + DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); + continue; + } + while( pElem->elem.common.flags & DT_FLAG_DATA ) { + /* now here we have a basic datatype */ + type = pElem->elem.common.type; + if( pElem->elem.common.flags & DT_FLAG_CONTIGUOUS ) { + /* the extent and the size of the basic datatype are equals */ + /* + * DO SOMETHING USEFULL ... + */ + } else { + /* the extent and the size of the basic datatype are differents */ + /* + * DO SOMETHING USEFULL ... + */ + } + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc, disp_desc ); + } + } + complete_loop: + pConvertor->bConverted += bConverted; /* update the already converted bytes */ + assert( bConverted <= iov[iov_count].iov_len ); + iov[iov_count].iov_len = bConverted; /* update the length in the iovec */ + } + if( pConvertor->bConverted != (pData->size * pConvertor->count) ) { + /* I complete an element, next step I should go to the next one */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, DT_BYTE, count_desc, + disp_desc, pos_desc ); + return 0; + } + return 1; +} +