diff --git a/ompi/datatype/dt_optimize.c b/ompi/datatype/dt_optimize.c index b98f80e1e8..f6535e2c18 100644 --- a/ompi/datatype/dt_optimize.c +++ b/ompi/datatype/dt_optimize.c @@ -27,17 +27,53 @@ #endif #include +#define SET_EMPTY_ELEMENT( ELEM ) /* Reset ELEM to the empty state: DT_FLAG_BASIC flags, DT_LOOP type, zero count, disp and extent. ELEM is evaluated once. */ \ + do { \ + ddt_elem_desc_t* _elem = (ELEM); \ + _elem->common.flags = DT_FLAG_BASIC; \ + _elem->common.type = DT_LOOP; \ + _elem->count = 0; \ + _elem->disp = 0; \ + _elem->extent = 0; \ + } while (0) + /* If opt_elem holds a pending element (count != 0), copy it into pElemDesc->elem and reset opt_elem to the empty state. pElemDesc itself is not advanced here. Always returns 0. */ +static inline int SAVE_OPTIMIZED_ELEMENT( dt_elem_desc_t* pElemDesc, + ddt_elem_desc_t* opt_elem ) +{ + if( 0 != opt_elem->count ) { + pElemDesc->elem = *opt_elem; + SET_EMPTY_ELEMENT( opt_elem ); + } + return 0; +} + /* Store (type, count, disp, extent) with DT_FLAG_BASIC flags into opt_elem, but only when opt_elem is empty (count == 0). Returns 0 when the values were stored and 1 when opt_elem was already occupied and nothing was written. pElemDesc is accepted but not used. A stored count of 0 leaves opt_elem looking empty. */ +static inline int ADD_ELEMENT( dt_elem_desc_t* pElemDesc, + ddt_elem_desc_t* opt_elem, + uint16_t type, uint32_t count, long disp, int32_t extent ) +{ + if( 0 == opt_elem->count ) { + opt_elem->common.flags = DT_FLAG_BASIC; + opt_elem->common.type = type; + opt_elem->count = count; + opt_elem->disp = disp; + opt_elem->extent = extent; + return 0; + } + return 1; +} + static int32_t ompi_ddt_optimize_short( ompi_datatype_t* pData, int32_t count, dt_type_desc_t* pTypeDesc ) { dt_elem_desc_t* pElemDesc; + ddt_elem_desc_t opt_elem; /* scratch element: reset to empty, then its disp is seeded from first_elem_disp before the main loop */ long last_disp = 0; dt_stack_t* pStack; /* pointer to the position on the stack */ int32_t pos_desc = 0; /* actual position in the description of the derived datatype */ int32_t stack_pos = 0, last_type = DT_BYTE; - int32_t type = DT_BYTE, last_length = 0, nbElems = 0, changes = 0, last_extent = 1; + int32_t type = DT_LOOP, last_length = 0, nbElems = 0, changes = 0, last_extent = 1; uint16_t last_flags = 0xFFFF; /* keep all for the first datatype */ long total_disp = 0; int32_t optimized = 0; @@ -49,6 +85,10 @@ ompi_ddt_optimize_short( ompi_datatype_t* pData, pTypeDesc->desc = pElemDesc = (dt_elem_desc_t*)malloc( sizeof(dt_elem_desc_t) * pTypeDesc->length ); pTypeDesc->used = 0; + SET_EMPTY_ELEMENT( &opt_elem ); + assert( DT_END_LOOP == 
pData->desc.desc[pData->desc.used].elem.common.type ); /* the entry at index desc.used is expected to be the trailing DT_END_LOOP marker */ + opt_elem.disp = pData->desc.desc[pData->desc.used].end_loop.first_elem_disp; + while( stack_pos >= 0 ) { if( DT_END_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */ ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop); @@ -59,7 +99,7 @@ ompi_ddt_optimize_short( ompi_datatype_t* pData, last_length = 0; } CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1, /* # of elems in this loop */ - end_loop->total_extent, end_loop->size, end_loop->common.flags ); + end_loop->first_elem_disp, end_loop->size, end_loop->common.flags ); pElemDesc++; nbElems++; if( --stack_pos >= 0 ) { /* still something to do ? */ ddt_loop_desc_t* pStartLoop = &(pTypeDesc->desc[pStack->index - 1].loop); @@ -80,7 +120,8 @@ ompi_ddt_optimize_short( ompi_datatype_t* pData, /* the loop is contiguous or composed by contiguous elements with a gap */ if( loop->extent == (long)end_loop->size ) { /* the whole loop is contiguous */ - if( (last_disp + last_length) != (total_disp + loop_disp) ) { + if( (last_disp + last_length * (long)ompi_ddt_basicDatatypes[last_type]->size) + != (total_disp + loop_disp) ) { if( 0 != last_length ) { CREATE_ELEM( pElemDesc, last_type, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; @@ -88,7 +129,10 @@ ompi_ddt_optimize_short( ompi_datatype_t* pData, } last_disp = total_disp + loop_disp; } - last_length += loop->loops * end_loop->size; + last_length = last_length * ompi_ddt_basicDatatypes[last_type]->size + + loop->loops * end_loop->size; + last_type = DT_BYTE; /* last_length was just rescaled by the element size, so the pending run is now typed DT_BYTE with extent 1 */ + last_extent = 1; optimized++; } else { int counter = loop->loops; @@ -97,34 +141,37 @@ ompi_ddt_optimize_short( ompi_datatype_t* pData, if( (last_disp + last_length) == (total_disp + loop_disp) ) { last_length *= ompi_ddt_basicDatatypes[last_type]->size; last_length += end_loop->size; - last_type = DT_BYTE; + last_type = DT_BYTE; + last_extent = 1; /* NOTE(review): the merge test a few lines up still compares last_disp + last_length without scaling by the element size, unlike the contiguous branch changed by this patch; confirm whether that is intended */ counter--; } CREATE_ELEM( 
pElemDesc, last_type, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; last_disp += last_length; last_length = 0; + last_type = DT_LOOP; /* pending run was flushed; presumably DT_LOOP marks that no basic type is pending, as in SET_EMPTY_ELEMENT - confirm */ } /* we have a gap in the begining or the end of the loop but the whole * loop can be merged in just one memcpy. */ CREATE_LOOP_START( pElemDesc, counter, (long)2, loop->extent, loop->common.flags ); pElemDesc++; nbElems++; - CREATE_ELEM( pElemDesc, last_type, DT_FLAG_BASIC, end_loop->size, loop_disp, last_extent ); + CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, end_loop->size, loop_disp, 1); pElemDesc++; nbElems++; - CREATE_LOOP_END( pElemDesc, 2, end_loop->total_extent, end_loop->size, + CREATE_LOOP_END( pElemDesc, 2, end_loop->first_elem_disp, end_loop->size, end_loop->common.flags ); pElemDesc++; nbElems++; if( loop->items > 2 ) optimized++; } - pos_desc += pData->desc.desc[pos_desc].loop.items + 1; + pos_desc += loop->items + 1; changes++; } else { if( last_length != 0 ) { CREATE_ELEM( pElemDesc, last_type, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; - last_disp += last_length; + last_disp += last_length; last_length = 0; + last_type = DT_LOOP; } CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags ); pElemDesc++; nbElems++; @@ -174,7 +221,7 @@ ompi_ddt_optimize_short( ompi_datatype_t* pData, } if( last_length != 0 ) { - CREATE_ELEM( pElemDesc, DT_BYTE, DT_FLAG_BASIC, last_length, last_disp, last_extent ); + CREATE_ELEM( pElemDesc, last_type, DT_FLAG_BASIC, last_length, last_disp, last_extent ); pElemDesc++; nbElems++; } /* cleanup the stack */ @@ -186,18 +233,31 @@ int32_t ompi_ddt_commit( ompi_datatype_t** data ) { ompi_datatype_t* pData = *data; ddt_endloop_desc_t* pLast = &(pData->desc.desc[pData->desc.used].end_loop); + long first_elem_disp = 0; /* stays 0 when pData->size is 0 */ if( pData->flags & DT_FLAG_COMMITED ) return OMPI_SUCCESS; pData->flags |= DT_FLAG_COMMITED; + /* We have to compute the displacement of the first non-loop item in the + * 
description. + */ + if( 0 != pData->size ) { + int index; + dt_elem_desc_t* pElem = pData->desc.desc; + + index = GET_FIRST_NON_LOOP( pElem ); /* NOTE(review): GET_FIRST_NON_LOOP is defined outside this patch; presumably it yields the index of the first non-loop entry - confirm */ + assert( pData->desc.desc[index].elem.common.flags & DT_FLAG_DATA ); + first_elem_disp = pData->desc.desc[index].elem.disp; + } + /* let's add a fake element at the end just to avoid useless comparaisons * in pack/unpack functions. */ - pLast->common.type = DT_END_LOOP; - pLast->common.flags = 0; - pLast->items = pData->desc.used; - pLast->total_extent = pData->ub - pData->lb; - pLast->size = pData->size; + pLast->common.type = DT_END_LOOP; + pLast->common.flags = 0; + pLast->items = pData->desc.used; + pLast->first_elem_disp = first_elem_disp; /* takes the place of the former total_extent field (ub - lb) */ + pLast->size = pData->size; /* If there is no datatype description how can we have an optimized description ? */ if( 0 == pData->desc.used ) { @@ -216,11 +276,11 @@ int32_t ompi_ddt_commit( ompi_datatype_t** data ) * in pack/unpack functions. */ pLast = &(pData->opt_desc.desc[pData->opt_desc.used].end_loop); - pLast->common.type = DT_END_LOOP; - pLast->common.flags = 0; - pLast->items = pData->opt_desc.used; - pLast->total_extent = pData->ub - pData->lb; - pLast->size = pData->size; + pLast->common.type = DT_END_LOOP; + pLast->common.flags = 0; + pLast->items = pData->opt_desc.used; + pLast->first_elem_disp = first_elem_disp; /* same value as stored in the trailing marker of the non-optimized description */ + pLast->size = pData->size; } return OMPI_SUCCESS; }