1
1

Merge pull request #6695 from bosilca/fix/vector_stride_0

A big refresh of the datatype engine
Этот коммит содержится в:
bosilca 2019-07-23 15:20:14 -04:00 коммит произвёл GitHub
родитель 8f32a59304 aa17392309
Коммит 94f26f5a51
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
24 изменённых файлов: 1320 добавлений и 902 удалений

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/* /*
* Copyright (c) 2009-2013 The University of Tennessee and The University * Copyright (c) 2009-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.

Просмотреть файл

@ -87,10 +87,10 @@ int32_t ompi_datatype_create_hindexed( int count, const int* pBlockLength, const
return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType); return ompi_datatype_duplicate( &ompi_mpi_datatype_null.dt, newType);
} }
ompi_datatype_type_extent( oldType, &extent );
disp = pDisp[i]; disp = pDisp[i];
dLength = pBlockLength[i]; dLength = pBlockLength[i];
endat = disp + dLength * extent; endat = disp + dLength * extent;
ompi_datatype_type_extent( oldType, &extent );
pdt = ompi_datatype_create( (count - i) * (2 + oldType->super.desc.used) ); pdt = ompi_datatype_create( (count - i) * (2 + oldType->super.desc.used) );
for( i += 1; i < count; i++ ) { for( i += 1; i < count; i++ ) {
@ -162,17 +162,17 @@ int32_t ompi_datatype_create_hindexed_block( int count, int bLength, const ptrdi
pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) ); pdt = ompi_datatype_create( count * (2 + oldType->super.desc.used) );
disp = pDisp[0]; disp = pDisp[0];
dLength = bLength; dLength = bLength;
endat = disp + dLength; endat = disp + dLength * extent;
for( i = 1; i < count; i++ ) { for( i = 1; i < count; i++ ) {
if( endat == pDisp[i] ) { if( endat == pDisp[i] ) {
/* contiguous with the previsious */ /* contiguous with the previsious */
dLength += bLength; dLength += bLength;
endat += bLength; endat += bLength * extent;
} else { } else {
ompi_datatype_add( pdt, oldType, dLength, disp, extent ); ompi_datatype_add( pdt, oldType, dLength, disp, extent );
disp = pDisp[i]; disp = pDisp[i];
dLength = bLength; dLength = bLength;
endat = disp + bLength; endat = disp + bLength * extent;
} }
} }
ompi_datatype_add( pdt, oldType, dLength, disp, extent ); ompi_datatype_add( pdt, oldType, dLength, disp, extent );

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
@ -26,7 +26,6 @@
#include <stdio.h> #include <stdio.h>
#include "ompi/runtime/params.h" #include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
#include "ompi/datatype/ompi_datatype.h" #include "ompi/datatype/ompi_datatype.h"
#include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_convertor.h"

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2018 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -324,7 +324,8 @@ complete_contiguous_data_unpack:
return pConv->fAdvance( pConv, iov, out_size, max_data ); return pConv->fAdvance( pConv, iov, out_size, max_data );
} }
static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor, static inline int
opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor,
size_t starting_point, const size_t* sizes ) size_t starting_point, const size_t* sizes )
{ {
dt_stack_t* pStack; /* pointer to the position on the stack */ dt_stack_t* pStack; /* pointer to the position on the stack */
@ -349,14 +350,14 @@ static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t*
pStack[0].disp = count * extent; pStack[0].disp = count * extent;
/* now compute the number of pending bytes */ /* now compute the number of pending bytes */
count = starting_point - count * pData->size; count = starting_point % pData->size;
/** /**
* We save the current displacement starting from the begining * We save the current displacement starting from the begining
* of this data. * of this data.
*/ */
if( OPAL_LIKELY(0 == count) ) { if( OPAL_LIKELY(0 == count) ) {
pStack[1].type = pElems->elem.common.type; pStack[1].type = pElems->elem.common.type;
pStack[1].count = pElems->elem.count; pStack[1].count = pElems->elem.blocklen;
} else { } else {
pStack[1].type = OPAL_DATATYPE_UINT1; pStack[1].type = OPAL_DATATYPE_UINT1;
pStack[1].count = pData->size - count; pStack[1].count = pData->size - count;
@ -370,8 +371,8 @@ static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t*
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
static inline static inline int
int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor, opal_convertor_create_stack_at_begining( opal_convertor_t* convertor,
const size_t* sizes ) const size_t* sizes )
{ {
dt_stack_t* pStack = convertor->pStack; dt_stack_t* pStack = convertor->pStack;
@ -402,7 +403,7 @@ int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor,
pStack[1].count = pElems[0].loop.loops; pStack[1].count = pElems[0].loop.loops;
pStack[1].type = OPAL_DATATYPE_LOOP; pStack[1].type = OPAL_DATATYPE_LOOP;
} else { } else {
pStack[1].count = pElems[0].elem.count; pStack[1].count = pElems[0].elem.count * pElems[0].elem.blocklen;
pStack[1].type = pElems[0].elem.common.type; pStack[1].type = pElems[0].elem.common.type;
} }
return OPAL_SUCCESS; return OPAL_SUCCESS;
@ -578,8 +579,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
assert(! (convertor->flags & CONVERTOR_SEND)); assert(! (convertor->flags & CONVERTOR_SEND));
OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { #if defined(CHECKSUM)
if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) { if( OPAL_UNLIKELY(convertor->flags & CONVERTOR_WITH_CHECKSUM) ) {
if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) {
convertor->fAdvance = opal_unpack_general_checksum; convertor->fAdvance = opal_unpack_general_checksum;
} else { } else {
if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
@ -588,8 +590,9 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
convertor->fAdvance = opal_generic_simple_unpack_checksum; convertor->fAdvance = opal_generic_simple_unpack_checksum;
} }
} }
} else { } else
if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) { #endif /* defined(CHECKSUM) */
if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) {
convertor->fAdvance = opal_unpack_general; convertor->fAdvance = opal_unpack_general;
} else { } else {
if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
@ -598,7 +601,6 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
convertor->fAdvance = opal_generic_simple_unpack; convertor->fAdvance = opal_generic_simple_unpack;
} }
} }
}
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }
@ -617,6 +619,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
#if defined(CHECKSUM)
if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
convertor->fAdvance = opal_pack_general_checksum; convertor->fAdvance = opal_pack_general_checksum;
@ -631,7 +634,8 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
convertor->fAdvance = opal_generic_simple_pack_checksum; convertor->fAdvance = opal_generic_simple_pack_checksum;
} }
} }
} else { } else
#endif /* defined(CHECKSUM) */
if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) { if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
convertor->fAdvance = opal_pack_general; convertor->fAdvance = opal_pack_general;
} else { } else {
@ -645,7 +649,6 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
convertor->fAdvance = opal_generic_simple_pack; convertor->fAdvance = opal_generic_simple_pack;
} }
} }
}
return OPAL_SUCCESS; return OPAL_SUCCESS;
} }

Просмотреть файл

@ -332,8 +332,10 @@ opal_convertor_set_position( opal_convertor_t* convertor,
/* Remove the completed flag if it's already set */ /* Remove the completed flag if it's already set */
convertor->flags &= ~CONVERTOR_COMPLETED; convertor->flags &= ~CONVERTOR_COMPLETED;
if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && if( (convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) &&
(convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) && #if defined(CHECKSUM)
!(convertor->flags & CONVERTOR_WITH_CHECKSUM) &&
#endif /* defined(CHECKSUM) */
(convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { (convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) {
/* Contiguous and no checkpoint and no homogeneous unpack */ /* Contiguous and no checkpoint and no homogeneous unpack */
convertor->bConverted = *position; convertor->bConverted = *position;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
* Copyright (c) 2004-2009 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
@ -30,6 +30,29 @@
#define DO_DEBUG(INST) #define DO_DEBUG(INST)
#endif /* OPAL_ENABLE_DEBUG */ #endif /* OPAL_ENABLE_DEBUG */
/* Take a new iovec (base + len) and try to merge it with what we already
* have. If we succeed return 0 and move forward, otherwise save it into a new
* iovec location. If we need to advance position and we reach the end
* of the iovec array, return 1 to signal we did not saved the last iovec.
*/
static inline int
opal_convertor_merge_iov( struct iovec* iov, uint32_t* iov_count,
IOVBASE_TYPE* base, size_t len,
uint32_t* idx )
{
if( 0 != iov[*idx].iov_len ) {
if( (base == ((char*)iov[*idx].iov_base + iov[*idx].iov_len)) ) {
iov[*idx].iov_len += len; /* merge with previous iovec */
return 0;
} /* cannot merge, move to the next position */
*idx = *idx + 1;
if( *idx == *iov_count ) return 1; /* do not overwrite outside the iovec array boundaries */
}
iov[*idx].iov_base = base;
iov[*idx].iov_len = len;
return 0;
}
/** /**
* This function always work in local representation. This means no representation * This function always work in local representation. This means no representation
* conversion (i.e. no heterogeneity) is taken into account, and that all * conversion (i.e. no heterogeneity) is taken into account, and that all
@ -44,9 +67,10 @@ opal_convertor_raw( opal_convertor_t* pConvertor,
dt_stack_t* pStack; /* pointer to the position on the stack */ dt_stack_t* pStack; /* pointer to the position on the stack */
uint32_t pos_desc; /* actual position in the description of the derived datatype */ uint32_t pos_desc; /* actual position in the description of the derived datatype */
size_t count_desc; /* the number of items already done in the actual pos_desc */ size_t count_desc; /* the number of items already done in the actual pos_desc */
size_t do_now, blength;
dt_elem_desc_t* description, *pElem; dt_elem_desc_t* description, *pElem;
unsigned char *source_base; /* origin of the data */ unsigned char *source_base; /* origin of the data */
size_t raw_data = 0; /* sum of raw data lengths in the iov_len fields */ size_t sum_iov_len = 0; /* sum of raw data lengths in the iov_len fields */
uint32_t index = 0; /* the iov index and a simple counter */ uint32_t index = 0; /* the iov index and a simple counter */
assert( (*iov_count) > 0 ); assert( (*iov_count) > 0 );
@ -87,64 +111,86 @@ opal_convertor_raw( opal_convertor_t* pConvertor,
pStack--; pStack--;
pConvertor->stack_pos--; pConvertor->stack_pos--;
pElem = &(description[pos_desc]); pElem = &(description[pos_desc]);
source_base += pStack->disp;
DO_DEBUG( opal_output( 0, "raw start pos_desc %d count_desc %" PRIsize_t " disp %ld\n" DO_DEBUG( opal_output( 0, "raw start pos_desc %d count_desc %" PRIsize_t " disp %ld\n"
"stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n", "stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
pos_desc, count_desc, (long)(source_base - pConvertor->pBaseBuf), pos_desc, count_desc, (long)(source_base - pConvertor->pBaseBuf),
pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); ); pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); );
while( 1 ) {
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { iov[index].iov_len = 0;
size_t blength = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; /* Special case if we start from a position that is in the middle of a data element blocklen.
source_base += pElem->elem.disp; * We can treat this outside the loop as it is an exception that can only happen once,
if( blength == (size_t)pElem->elem.extent ) { /* no resized data */ * and will simplify the loop handling.
if( index < *iov_count ) { */
blength *= count_desc; if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* now here we have a basic datatype */ const ddt_elem_desc_t* current = &(pElem->elem);
if( count_desc != (current->count * current->blocklen) ) { /* Not the full element description */
do_now = current->blocklen - (count_desc % current->blocklen); /* how much left in the block */
if( do_now ) {
source_base += current->disp;
blength = do_now * opal_datatype_basicDatatypes[current->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf,
pConvertor->pDesc, pConvertor->count ); pConvertor->pDesc, pConvertor->count );
DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n", DO_DEBUG( opal_output( 0, "raw 1. iov[%d] = {base %p, length %" PRIsize_t "}\n",
index, (void*)source_base, blength ); ); index, (void*)source_base, blength ); );
iov[index].iov_base = (IOVBASE_TYPE *) source_base; opal_convertor_merge_iov( iov, iov_count,
iov[index].iov_len = blength; (IOVBASE_TYPE *) source_base, blength, &index );
source_base += blength; /* not check the return value, we know there was at least one element in the iovec */
raw_data += blength; sum_iov_len += blength;
index++; count_desc -= do_now;
count_desc = 0;
source_base += (current->extent - current->disp +
(current->blocklen - do_now) * opal_datatype_basicDatatypes[current->common.type]->size);
} }
} else { }
for(size_t i = count_desc; (i > 0) && (index < *iov_count); i--, index++ ) { }
while( 1 ) {
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
const ddt_elem_desc_t* current = &(pElem->elem);
source_base += current->disp;
do_now = current->count;
if( count_desc != (current->count * current->blocklen) ) {
do_now = count_desc / current->blocklen;
assert( 0 == (count_desc % current->blocklen) );
}
blength = current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size;
for(size_t _i = 0; _i < do_now; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, blength, pConvertor->pBaseBuf,
pConvertor->pDesc, pConvertor->count ); pConvertor->pDesc, pConvertor->count );
DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n", DO_DEBUG( opal_output( 0, "raw 2. iov[%d] = {base %p, length %" PRIsize_t "}\n",
index, (void*)source_base, blength ); ); index, (void*)source_base, blength ); );
iov[index].iov_base = (IOVBASE_TYPE *) source_base; if( opal_convertor_merge_iov( iov, iov_count,
iov[index].iov_len = blength; (IOVBASE_TYPE *) source_base, blength, &index ) )
source_base += pElem->elem.extent; break; /* no more iovec available, bail out */
raw_data += blength;
count_desc--; source_base += current->extent;
sum_iov_len += blength;
count_desc -= current->blocklen;
} }
}
source_base -= pElem->elem.disp;
if( 0 == count_desc ) { /* completed */ if( 0 == count_desc ) { /* completed */
source_base = pConvertor->pBaseBuf + pStack->disp; source_base = pConvertor->pBaseBuf + pStack->disp;
pos_desc++; /* advance to the next data */ pos_desc++; /* advance to the next data */
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
continue; continue;
} }
source_base -= current->disp;
goto complete_loop; goto complete_loop;
} }
if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
DO_DEBUG( opal_output( 0, "raw end_loop count %" PRIsize_t " stack_pos %d" DO_DEBUG( opal_output( 0, "raw end_loop count %" PRIsize_t " stack_pos %d"
" pos_desc %d disp %ld space %lu\n", " pos_desc %d disp %ld space %" PRIsize_t "\n",
pStack->count, pConvertor->stack_pos, pStack->count, pConvertor->stack_pos,
pos_desc, (long)pStack->disp, (unsigned long)raw_data ); ); pos_desc, (long)pStack->disp, sum_iov_len ); );
if( --(pStack->count) == 0 ) { /* end of loop */ if( --(pStack->count) == 0 ) { /* end of loop */
if( pConvertor->stack_pos == 0 ) { if( 0 == pConvertor->stack_pos ) {
/* we lie about the size of the next element in order to /* we're done. Force the exit of the main for loop (around iovec) */
* make sure we exit the main loop. index++; /* account for the currently updating iovec */
*/ goto complete_loop;
*iov_count = index;
goto complete_loop; /* completed */
} }
pConvertor->stack_pos--; pConvertor->stack_pos--;
pStack--; pStack--;
@ -155,15 +201,15 @@ opal_convertor_raw( opal_convertor_t* pConvertor,
pStack->disp += (pData->ub - pData->lb); pStack->disp += (pData->ub - pData->lb);
} else { } else {
assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type );
pStack->disp += description[pStack->index].loop.extent; pStack->disp += description[pStack->index].loop.extent; /* jump by the loop extent */
} }
} }
source_base = pConvertor->pBaseBuf + pStack->disp; source_base = pConvertor->pBaseBuf + pStack->disp;
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
DO_DEBUG( opal_output( 0, "raw new_loop count %" PRIsize_t " stack_pos %d " DO_DEBUG( opal_output( 0, "raw new_loop count %" PRIsize_t " stack_pos %d "
"pos_desc %d disp %ld space %lu\n", "pos_desc %d disp %ld space %" PRIsize_t "\n",
pStack->count, pConvertor->stack_pos, pStack->count, pConvertor->stack_pos,
pos_desc, (long)pStack->disp, (unsigned long)raw_data ); ); pos_desc, (long)pStack->disp, sum_iov_len ); );
} }
if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
ptrdiff_t local_disp = (ptrdiff_t)source_base; ptrdiff_t local_disp = (ptrdiff_t)source_base;
@ -172,42 +218,39 @@ opal_convertor_raw( opal_convertor_t* pConvertor,
if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
ptrdiff_t offset = end_loop->first_elem_disp; ptrdiff_t offset = end_loop->first_elem_disp;
source_base += offset; source_base += offset;
for(size_t i = MIN(count_desc, *iov_count - index); i > 0; i--, index++ ) { for(; count_desc > 0; ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( source_base, end_loop->size, pConvertor->pBaseBuf,
pConvertor->pDesc, pConvertor->count ); pConvertor->pDesc, pConvertor->count );
iov[index].iov_base = (IOVBASE_TYPE *) source_base; if( opal_convertor_merge_iov( iov, iov_count,
iov[index].iov_len = end_loop->size; (IOVBASE_TYPE *) source_base, end_loop->size, &index ) ) {
source_base += pElem->loop.extent;
raw_data += end_loop->size;
count_desc--;
DO_DEBUG( opal_output( 0, "raw contig loop generate iov[%d] = {base %p, length %" PRIsize_t "}"
"space %lu [pos_desc %d]\n",
index, iov[index].iov_base, iov[index].iov_len,
(unsigned long)raw_data, pos_desc ); );
}
source_base -= offset; source_base -= offset;
if( 0 == count_desc ) { /* completed */
pos_desc += pElem->loop.items + 1;
goto update_loop_description;
}
}
if( index == *iov_count ) { /* all iov have been filled, we need to bail out */
goto complete_loop; goto complete_loop;
} }
source_base += pElem->loop.extent;
sum_iov_len += end_loop->size;
count_desc--;
DO_DEBUG( opal_output( 0, "raw contig loop generate iov[%d] = {base %p, length %" PRIsize_t "}"
"space %" PRIsize_t " [pos_desc %d]\n",
index, iov[index].iov_base, iov[index].iov_len,
sum_iov_len, pos_desc ); );
}
source_base -= offset;
pos_desc += pElem->loop.items + 1;
} else {
local_disp = (ptrdiff_t)source_base - local_disp; local_disp = (ptrdiff_t)source_base - local_disp;
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc,
pStack->disp + local_disp); pStack->disp + local_disp);
pos_desc++; pos_desc++;
update_loop_description: /* update the current state */ }
source_base = pConvertor->pBaseBuf + pStack->disp; source_base = pConvertor->pBaseBuf + pStack->disp;
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" );
continue;
} }
} }
complete_loop: complete_loop:
pConvertor->bConverted += raw_data; /* update the already converted bytes */ pConvertor->bConverted += sum_iov_len; /* update the already converted bytes */
*length = raw_data; *length = sum_iov_len;
*iov_count = index; *iov_count = index;
if( pConvertor->bConverted == pConvertor->local_size ) { if( pConvertor->bConverted == pConvertor->local_size ) {
pConvertor->flags |= CONVERTOR_COMPLETED; pConvertor->flags |= CONVERTOR_COMPLETED;

Просмотреть файл

@ -227,14 +227,42 @@ opal_datatype_is_contiguous_memory_layout( const opal_datatype_t* datatype, int3
} }
OPAL_DECLSPEC void opal_datatype_dump( const opal_datatype_t* pData ); OPAL_DECLSPEC void
opal_datatype_dump( const opal_datatype_t* pData );
/* data creation functions */ /* data creation functions */
OPAL_DECLSPEC int32_t opal_datatype_clone( const opal_datatype_t * src_type, opal_datatype_t * dest_type );
OPAL_DECLSPEC int32_t opal_datatype_create_contiguous( int count, const opal_datatype_t* oldType, opal_datatype_t** newType ); /**
OPAL_DECLSPEC int32_t opal_datatype_resize( opal_datatype_t* type, ptrdiff_t lb, ptrdiff_t extent ); * Create a duplicate of the source datatype.
OPAL_DECLSPEC int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtAdd, size_t count, */
OPAL_DECLSPEC int32_t
opal_datatype_clone( const opal_datatype_t* src_type,
opal_datatype_t* dest_type );
/**
* A contiguous array of identical datatypes.
*/
OPAL_DECLSPEC int32_t
opal_datatype_create_contiguous( int count, const opal_datatype_t* oldType,
opal_datatype_t** newType );
/**
* Add a new datatype to the base type description. The count is the number
* repetitions of the same element to be added, and the extent is the extent
* of each element. The displacement is the initial displacement of the
* first element.
*/
OPAL_DECLSPEC int32_t
opal_datatype_add( opal_datatype_t* pdtBase,
const opal_datatype_t* pdtAdd, size_t count,
ptrdiff_t disp, ptrdiff_t extent ); ptrdiff_t disp, ptrdiff_t extent );
/**
* Alter the lb and extent of an existing datatype in place.
*/
OPAL_DECLSPEC int32_t
opal_datatype_resize( opal_datatype_t* type,
ptrdiff_t lb,
ptrdiff_t extent );
static inline int32_t static inline int32_t
opal_datatype_type_lb( const opal_datatype_t* pData, ptrdiff_t* disp ) opal_datatype_type_lb( const opal_datatype_t* pData, ptrdiff_t* disp )
{ {

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -281,15 +281,23 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA
if( (pdtAdd->flags & (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA)) == (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA) ) { if( (pdtAdd->flags & (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA)) == (OPAL_DATATYPE_FLAG_PREDEFINED | OPAL_DATATYPE_FLAG_DATA) ) {
if( NULL != pdtBase->ptypes ) if( NULL != pdtBase->ptypes )
pdtBase->ptypes[pdtAdd->id] += count; pdtBase->ptypes[pdtAdd->id] += count;
pLast->elem.common.type = pdtAdd->id;
pLast->elem.count = count;
pLast->elem.disp = disp;
pLast->elem.extent = extent;
pdtBase->desc.used++;
pLast->elem.common.flags = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITTED); pLast->elem.common.flags = pdtAdd->flags & ~(OPAL_DATATYPE_FLAG_COMMITTED);
if( (extent != (ptrdiff_t)pdtAdd->size) && (count > 1) ) { /* gaps around the datatype */ pLast->elem.common.type = pdtAdd->id;
pLast->elem.disp = disp;
pLast->elem.extent = count * extent;
/* assume predefined datatypes without extent, aka. contiguous */
pLast->elem.count = 1;
pLast->elem.blocklen = count;
if( extent != (ptrdiff_t)pdtAdd->size ) { /* not contiguous: let's fix */
pLast->elem.count = count;
pLast->elem.blocklen = 1;
pLast->elem.extent = extent;
if( count > 1 ) { /* gaps around the predefined datatype */
pLast->elem.common.flags &= ~(OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS); pLast->elem.common.flags &= ~(OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_NO_GAPS);
} }
}
pdtBase->desc.used++;
} else { } else {
/* keep trace of the total number of basic datatypes in the datatype definition */ /* keep trace of the total number of basic datatypes in the datatype definition */
pdtBase->loops += pdtAdd->loops; pdtBase->loops += pdtAdd->loops;
@ -299,13 +307,40 @@ int32_t opal_datatype_add( opal_datatype_t* pdtBase, const opal_datatype_t* pdtA
for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ )
if( pdtAdd->ptypes[i] != 0 ) pdtBase->ptypes[i] += (count * pdtAdd->ptypes[i]); if( pdtAdd->ptypes[i] != 0 ) pdtBase->ptypes[i] += (count * pdtAdd->ptypes[i]);
} }
if( (1 == pdtAdd->desc.used) && (extent == (pdtAdd->ub - pdtAdd->lb)) && if( 1 == pdtAdd->desc.used ) {
(extent == pdtAdd->desc.desc[0].elem.extent) ){
pLast->elem = pdtAdd->desc.desc[0].elem; pLast->elem = pdtAdd->desc.desc[0].elem;
pLast->elem.count *= count;
pLast->elem.disp += disp; pLast->elem.disp += disp;
if( 1 == count ) {
/* Extent only has a meaning when there are multiple elements. Bail out */
} else if( 1 == pLast->elem.count ) {
/* The size and true_extent of the added datatype are identical, signaling a datatype
* that is mostly contiguous with the exception of the initial and final gaps. These
* gaps do not matter here as they will amended (the initial gaps being shifted by the
* new displacement and the final gap being replaced with the new gap
*/
if( pdtAdd->desc.desc[0].elem.extent == extent ) {
/* pure bliss everything is fully contiguous and we can collapse
* everything by updating the blocklen and extent
*/
pLast->elem.blocklen *= count;
pLast->elem.extent *= count;
} else {
pLast->elem.count = count;
pLast->elem.extent = extent;
}
} else if( extent == (ptrdiff_t)(pLast->elem.count * pLast->elem.extent) ) {
/* It's just a repetition of the same element, increase the count */
pLast->elem.count *= count;
} else {
/* No luck here, no optimization can be applied. Fall back to the
* normal case where we add a loop around the datatype.
*/
goto build_loop;
}
pdtBase->desc.used++; pdtBase->desc.used++;
} else { } else {
build_loop:
/* if the extent of the datatype is the same as the extent of the loop /* if the extent of the datatype is the same as the extent of the loop
* description of the datatype then we simply have to update the main loop. * description of the datatype then we simply have to update the main loop.
*/ */

Просмотреть файл

@ -48,37 +48,33 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM,
unsigned char* DESTINATION, unsigned char* DESTINATION,
size_t* SPACE ) size_t* SPACE )
{ {
size_t _copy_count = (COUNT);
size_t _copy_blength;
const ddt_elem_desc_t* _elem = &((ELEM)->elem); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
unsigned char* _source = (SOURCE) + _elem->disp; unsigned char* _source = (SOURCE) + _elem->disp;
unsigned char* _destination = (DESTINATION) + _elem->disp; unsigned char* _destination = (DESTINATION) + _elem->disp;
size_t do_now = _elem->count, do_now_bytes;
_copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; assert( (COUNT) == (do_now * _elem->blocklen));
if( _copy_blength == (size_t)_elem->extent ) { /* We don't a prologue and epilogue here as we are __always__ working
_copy_blength *= _copy_count; * with full copies of the data description.
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), */
/**
* Compute how many full blocklen we need to do and do them.
*/
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
assert( (do_now * do_now_bytes) <= (*SPACE) );
for(size_t _i = 0; _i < do_now; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE),
(DATATYPE), (TOTAL_COUNT) ); (DATATYPE), (TOTAL_COUNT) );
/* the extent and the size of the basic datatype are equals */ DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n",
DO_DEBUG( opal_output( 0, "copy 1. %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n", STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) - _i * do_now_bytes ); );
STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, _copy_blength, *(SPACE) ); ); MEM_OP( _destination, _source, do_now_bytes );
MEM_OP( _destination, _source, _copy_blength );
_source += _copy_blength;
_destination += _copy_blength;
} else {
for(size_t _i = 0; _i < _copy_count; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE),
(DATATYPE), (TOTAL_COUNT) );
DO_DEBUG( opal_output( 0, "copy 2. %s( %p, %p, %lu ) => space %lu\n",
STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); );
MEM_OP( _destination, _source, _copy_blength );
_source += _elem->extent;
_destination += _elem->extent; _destination += _elem->extent;
_source += _elem->extent;
} }
_copy_blength *= _copy_count; *(SPACE) -= (do_now_bytes * do_now);
}
*(SPACE) -= _copy_blength;
} }
static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, static inline void _contiguous_loop( const dt_elem_desc_t* ELEM,
@ -147,12 +143,10 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i
if( (ptrdiff_t)datatype->size == extent ) { /* all contiguous == no gaps around */ if( (ptrdiff_t)datatype->size == extent ) { /* all contiguous == no gaps around */
size_t total_length = iov_len_local; size_t total_length = iov_len_local;
size_t memop_chunk = opal_datatype_memop_block_size; size_t memop_chunk = opal_datatype_memop_block_size;
OPAL_DATATYPE_SAFEGUARD_POINTER( source, iov_len_local,
(unsigned char*)source_base, datatype, count );
while( total_length > 0 ) { while( total_length > 0 ) {
if( memop_chunk > total_length ) memop_chunk = total_length; if( memop_chunk > total_length ) memop_chunk = total_length;
OPAL_DATATYPE_SAFEGUARD_POINTER( destination, memop_chunk,
(unsigned char*)destination_base, datatype, count );
OPAL_DATATYPE_SAFEGUARD_POINTER( source, memop_chunk,
(unsigned char*)source_base, datatype, count );
DO_DEBUG( opal_output( 0, "copy c1. %s( %p, %p, %lu ) => space %lu\n", DO_DEBUG( opal_output( 0, "copy c1. %s( %p, %p, %lu ) => space %lu\n",
STRINGIFY(MEM_OP_NAME), (void*)destination, (void*)source, (unsigned long)memop_chunk, (unsigned long)total_length ); ); STRINGIFY(MEM_OP_NAME), (void*)destination, (void*)source, (unsigned long)memop_chunk, (unsigned long)total_length ); );
MEM_OP( destination, source, memop_chunk ); MEM_OP( destination, source, memop_chunk );
@ -184,17 +178,12 @@ static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, i
pos_desc = 0; pos_desc = 0;
stack_pos = 0; stack_pos = 0;
if( datatype->opt_desc.desc != NULL ) {
description = datatype->opt_desc.desc; description = datatype->opt_desc.desc;
} else { if( NULL == description ) {
description = datatype->desc.desc; description = datatype->desc.desc;
} }
if( description[0].elem.common.type == OPAL_DATATYPE_LOOP ) UPDATE_INTERNAL_COUNTERS( description, 0, pElem, count_desc );
count_desc = description[0].loop.loops;
else
count_desc = description[0].elem.count;
pElem = &(description[pos_desc]);
while( 1 ) { while( 1 ) {
while( OPAL_LIKELY(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) ) { while( OPAL_LIKELY(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) ) {

Просмотреть файл

@ -69,14 +69,14 @@ ssize_t opal_datatype_get_element_count( const opal_datatype_t* datatype, size_t
while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* now here we have a basic datatype */ /* now here we have a basic datatype */
const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]); const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]);
local_size = pElems[pos_desc].elem.count * basic_type->size; local_size = (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen) * basic_type->size;
if( local_size >= iSize ) { if( local_size >= iSize ) {
local_size = iSize / basic_type->size; local_size = iSize / basic_type->size;
nbElems += (int32_t)local_size; nbElems += (int32_t)local_size;
iSize -= local_size * basic_type->size; iSize -= local_size * basic_type->size;
return (iSize == 0 ? nbElems : -1); return (iSize == 0 ? nbElems : -1);
} }
nbElems += pElems[pos_desc].elem.count; nbElems += (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen);
iSize -= local_size; iSize -= local_size;
pos_desc++; /* advance to the next data */ pos_desc++; /* advance to the next data */
} }
@ -131,7 +131,7 @@ int32_t opal_datatype_set_element_count( const opal_datatype_t* datatype, size_t
while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* now here we have a basic datatype */ /* now here we have a basic datatype */
const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]); const opal_datatype_t* basic_type = BASIC_DDT_FROM_ELEM(pElems[pos_desc]);
local_length = pElems[pos_desc].elem.count; local_length = (pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen);
if( local_length >= count ) { if( local_length >= count ) {
*length += count * basic_type->size; *length += count * basic_type->size;
return 0; return 0;
@ -188,8 +188,8 @@ int opal_datatype_compute_ptypes( opal_datatype_t* datatype )
} }
while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { while( pElems[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* now here we have a basic datatype */ /* now here we have a basic datatype */
datatype->ptypes[pElems[pos_desc].elem.common.type] += pElems[pos_desc].elem.count; datatype->ptypes[pElems[pos_desc].elem.common.type] += pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen;
nbElems += pElems[pos_desc].elem.count; nbElems += pElems[pos_desc].elem.count * pElems[pos_desc].elem.blocklen;
DUMP( " compute_ptypes-add: type %d count %"PRIsize_t" (total type %"PRIsize_t" total %lld)\n", DUMP( " compute_ptypes-add: type %d count %"PRIsize_t" (total type %"PRIsize_t" total %lld)\n",
pElems[pos_desc].elem.common.type, datatype->ptypes[pElems[pos_desc].elem.common.type], pElems[pos_desc].elem.common.type, datatype->ptypes[pElems[pos_desc].elem.common.type],

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2018 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -217,19 +217,23 @@ union dt_elem_desc {
/** /**
* Create one or more elements depending on the value of _count. If the value * Create an element entry in the description. If the element is contiguous
* is too large for the type of elem.count then use oth the elem.count and * collapse everything into the blocklen.
* elem.blocklen to create it. If the number is prime then create a second
* element to account for the difference.
*/ */
#define CREATE_ELEM( _place, _type, _flags, _count, _disp, _extent ) \ #define CREATE_ELEM(_place, _type, _flags, _blocklen, _count, _disp, _extent) \
do { \ do { \
(_place)->elem.common.flags = (_flags) | OPAL_DATATYPE_FLAG_DATA; \ (_place)->elem.common.flags = (_flags) | OPAL_DATATYPE_FLAG_DATA; \
(_place)->elem.common.type = (_type); \ (_place)->elem.common.type = (_type); \
(_place)->elem.disp = (_disp); \ (_place)->elem.blocklen = (_blocklen); \
(_place)->elem.extent = (_extent); \
(_place)->elem.count = (_count); \ (_place)->elem.count = (_count); \
(_place)->elem.blocklen = 1; \ (_place)->elem.extent = (_extent); \
(_place)->elem.disp = (_disp); \
if( _extent == (ptrdiff_t)(_blocklen * opal_datatype_basicDatatypes[_type]->size) ) { \
/* collapse it into a single large blocklen */ \
(_place)->elem.blocklen *= _count; \
(_place)->elem.extent *= _count; \
(_place)->elem.count = 1; \
} \
} while(0) } while(0)
/* /*
* This array holds the descriptions desc.desc[2] of the predefined basic datatypes. * This array holds the descriptions desc.desc[2] of the predefined basic datatypes.
@ -503,17 +507,17 @@ static inline int GET_FIRST_NON_LOOP( const union dt_elem_desc* _pElem )
if( OPAL_DATATYPE_LOOP == (ELEMENT)->elem.common.type ) \ if( OPAL_DATATYPE_LOOP == (ELEMENT)->elem.common.type ) \
(COUNTER) = (ELEMENT)->loop.loops; \ (COUNTER) = (ELEMENT)->loop.loops; \
else \ else \
(COUNTER) = (ELEMENT)->elem.count; \ (COUNTER) = (ELEMENT)->elem.count * (ELEMENT)->elem.blocklen; \
} while (0) } while (0)
OPAL_DECLSPEC int opal_datatype_contain_basic_datatypes( const struct opal_datatype_t* pData, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_contain_basic_datatypes( const struct opal_datatype_t* pData, char* ptr, size_t length );
OPAL_DECLSPEC int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_dump_data_flags( unsigned short usflags, char* ptr, size_t length );
OPAL_DECLSPEC int opal_datatype_dump_data_desc( union dt_elem_desc* pDesc, int nbElems, char* ptr, size_t length ); OPAL_DECLSPEC int opal_datatype_dump_data_desc( union dt_elem_desc* pDesc, int nbElems, char* ptr, size_t length );
#if OPAL_ENABLE_DEBUG
extern bool opal_position_debug; extern bool opal_position_debug;
extern bool opal_copy_debug; extern bool opal_copy_debug;
#endif /* OPAL_ENABLE_DEBUG */ extern bool opal_unpack_debug;
extern bool opal_pack_debug;
END_C_DECLS END_C_DECLS
#endif /* OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED */ #endif /* OPAL_DATATYPE_INTERNAL_H_HAS_BEEN_INCLUDED */

Просмотреть файл

@ -252,8 +252,8 @@ int32_t opal_datatype_init( void )
OPAL_DATATYPE_FLAG_CONTIGUOUS | OPAL_DATATYPE_FLAG_CONTIGUOUS |
OPAL_DATATYPE_FLAG_NO_GAPS; OPAL_DATATYPE_FLAG_NO_GAPS;
datatype->desc.desc[0].elem.common.type = i; datatype->desc.desc[0].elem.common.type = i;
/* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */
datatype->desc.desc[0].elem.count = 1; datatype->desc.desc[0].elem.count = 1;
datatype->desc.desc[0].elem.blocklen = 1;
datatype->desc.desc[0].elem.disp = 0; datatype->desc.desc[0].elem.disp = 0;
datatype->desc.desc[0].elem.extent = datatype->size; datatype->desc.desc[0].elem.extent = datatype->size;

Просмотреть файл

@ -2,6 +2,9 @@
/* /*
* Copyright (c) 2018 Research Organization for Information Science * Copyright (c) 2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2018-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -18,35 +21,43 @@
#include "opal/datatype/opal_datatype_internal.h" #include "opal/datatype/opal_datatype_internal.h"
#include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_convertor.h"
#define OPAL_DATATYPE_MAX_MONOTONIC_IOVEC 32
/**
* Check if the datatype describes a memory layout where the pointers to
* the contiguous pieces are always advancing in the same direction, i.e.
* there is no potential for overlap.
*/
int32_t opal_datatype_is_monotonic(opal_datatype_t* type ) int32_t opal_datatype_is_monotonic(opal_datatype_t* type )
{ {
struct iovec iov[OPAL_DATATYPE_MAX_MONOTONIC_IOVEC];
ptrdiff_t upper_limit = (ptrdiff_t)type->true_lb; /* as conversion base will be NULL the first address is true_lb */
size_t max_data = 0x7FFFFFFF;
opal_convertor_t *pConv; opal_convertor_t *pConv;
uint32_t iov_count;
struct iovec iov[5];
size_t max_data = 0;
long prev = -1;
int rc;
bool monotonic = true; bool monotonic = true;
uint32_t iov_count;
int rc;
pConv = opal_convertor_create( opal_local_arch, 0 ); pConv = opal_convertor_create( opal_local_arch, 0 );
if (OPAL_UNLIKELY(NULL == pConv)) { if (OPAL_UNLIKELY(NULL == pConv)) {
return 0; return -1;
} }
rc = opal_convertor_prepare_for_send( pConv, type, 1, NULL ); rc = opal_convertor_prepare_for_send( pConv, type, 1, NULL );
if( OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { if( OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
OBJ_RELEASE(pConv); OBJ_RELEASE(pConv);
return 0; return -1;
} }
do { do {
iov_count = 5; iov_count = OPAL_DATATYPE_MAX_MONOTONIC_IOVEC;
rc = opal_convertor_raw( pConv, iov, &iov_count, &max_data); rc = opal_convertor_raw( pConv, iov, &iov_count, &max_data);
for (uint32_t i=0; i<iov_count; i++) { for (uint32_t i = 0; i < iov_count; i++) {
if ((long)iov[i].iov_base < prev) { if ((ptrdiff_t)iov[i].iov_base < upper_limit) {
monotonic = false; monotonic = false;
goto cleanup; goto cleanup;
} }
prev = (long)iov[i].iov_base; /* The new upper bound is at the end of the iovec */
upper_limit = (ptrdiff_t)iov[i].iov_base + iov[i].iov_len;
} }
} while (rc != 1); } while (rc != 1);

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -30,32 +30,19 @@
#include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_internal.h" #include "opal/datatype/opal_datatype_internal.h"
#define SET_EMPTY_ELEMENT( ELEM ) \
do { \
ddt_elem_desc_t* _elem = (ELEM); \
_elem->common.flags = OPAL_DATATYPE_FLAG_BASIC; \
_elem->common.type = OPAL_DATATYPE_LOOP; \
_elem->count = 0; \
_elem->disp = 0; \
_elem->extent = 0; \
} while (0)
static int32_t static int32_t
opal_datatype_optimize_short( opal_datatype_t* pData, opal_datatype_optimize_short( opal_datatype_t* pData,
size_t count, size_t count,
dt_type_desc_t* pTypeDesc ) dt_type_desc_t* pTypeDesc )
{ {
dt_elem_desc_t* pElemDesc; dt_elem_desc_t* pElemDesc;
ddt_elem_desc_t opt_elem; dt_stack_t *pOrigStack, *pStack; /* pointer to the position on the stack */
dt_stack_t* pOrigStack;
dt_stack_t* pStack; /* pointer to the position on the stack */
int32_t pos_desc = 0; /* actual position in the description of the derived datatype */ int32_t pos_desc = 0; /* actual position in the description of the derived datatype */
int32_t stack_pos = 0, last_type = OPAL_DATATYPE_UINT1; int32_t stack_pos = 0;
int32_t type = OPAL_DATATYPE_LOOP, nbElems = 0, continuity; int32_t nbElems = 0;
ptrdiff_t total_disp = 0, last_extent = 1, last_disp = 0; ptrdiff_t total_disp = 0;
uint16_t last_flags = 0xFFFF; /* keep all for the first datatype */ ddt_elem_desc_t last = {.common.flags = 0xFFFF /* all on */, .count = 0, .disp = 0}, compress;
uint32_t i; ddt_elem_desc_t* current;
size_t last_length = 0;
pOrigStack = pStack = (dt_stack_t*)malloc( sizeof(dt_stack_t) * (pData->loops+2) ); pOrigStack = pStack = (dt_stack_t*)malloc( sizeof(dt_stack_t) * (pData->loops+2) );
SAVE_STACK( pStack, -1, 0, count, 0 ); SAVE_STACK( pStack, -1, 0, count, 0 );
@ -64,186 +51,199 @@ opal_datatype_optimize_short( opal_datatype_t* pData,
pTypeDesc->desc = pElemDesc = (dt_elem_desc_t*)malloc( sizeof(dt_elem_desc_t) * pTypeDesc->length ); pTypeDesc->desc = pElemDesc = (dt_elem_desc_t*)malloc( sizeof(dt_elem_desc_t) * pTypeDesc->length );
pTypeDesc->used = 0; pTypeDesc->used = 0;
SET_EMPTY_ELEMENT( &opt_elem );
assert( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pData->desc.used].elem.common.type ); assert( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pData->desc.used].elem.common.type );
opt_elem.common.type = OPAL_DATATYPE_LOOP;
opt_elem.common.flags = 0xFFFF; /* keep all for the first datatype */
opt_elem.count = 0;
opt_elem.disp = pData->desc.desc[pData->desc.used].end_loop.first_elem_disp;
opt_elem.extent = 0;
while( stack_pos >= 0 ) { while( stack_pos >= 0 ) {
if( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */ if( OPAL_DATATYPE_END_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { /* end of the current loop */
ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop); ddt_endloop_desc_t* end_loop = &(pData->desc.desc[pos_desc].end_loop);
if( last_length != 0 ) { if( 0 != last.count ) {
CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
last.blocklen, last.count, last.disp, last.extent );
pElemDesc++; nbElems++; pElemDesc++; nbElems++;
last_disp += last_length; last.count= 0;
last_length = 0;
} }
CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1, /* # of elems in this loop */ CREATE_LOOP_END( pElemDesc, nbElems - pStack->index + 1, /* # of elems in this loop */
end_loop->first_elem_disp, end_loop->size, end_loop->common.flags ); end_loop->first_elem_disp, end_loop->size, end_loop->common.flags );
pElemDesc++; nbElems++;
if( --stack_pos >= 0 ) { /* still something to do ? */ if( --stack_pos >= 0 ) { /* still something to do ? */
ddt_loop_desc_t* pStartLoop = &(pTypeDesc->desc[pStack->index - 1].loop); ddt_loop_desc_t* pStartLoop = &(pTypeDesc->desc[pStack->index - 1].loop);
pStartLoop->items = end_loop->items; pStartLoop->items = pElemDesc->end_loop.items;
total_disp = pStack->disp; /* update the displacement position */ total_disp = pStack->disp; /* update the displacement position */
} }
pElemDesc++; nbElems++;
pStack--; /* go down one position on the stack */ pStack--; /* go down one position on the stack */
pos_desc++; pos_desc++;
continue; continue;
} }
if( OPAL_DATATYPE_LOOP == pData->desc.desc[pos_desc].elem.common.type ) { if( OPAL_DATATYPE_LOOP == pData->desc.desc[pos_desc].elem.common.type ) {
ddt_loop_desc_t* loop = (ddt_loop_desc_t*)&(pData->desc.desc[pos_desc]); ddt_loop_desc_t* loop = (ddt_loop_desc_t*)&(pData->desc.desc[pos_desc]);
ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]);
int index = GET_FIRST_NON_LOOP( &(pData->desc.desc[pos_desc]) ); int index = GET_FIRST_NON_LOOP( &(pData->desc.desc[pos_desc]) );
ptrdiff_t loop_disp = pData->desc.desc[pos_desc + index].elem.disp;
continuity = ((last_disp + (ptrdiff_t)last_length * (ptrdiff_t)opal_datatype_basicDatatypes[last_type]->size)
== (total_disp + loop_disp));
if( loop->common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { if( loop->common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
/* the loop is contiguous or composed by contiguous elements with a gap */ ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)&(pData->desc.desc[pos_desc + loop->items]);
if( loop->extent == (ptrdiff_t)end_loop->size ) {
/* the whole loop is contiguous */ assert(pData->desc.desc[pos_desc + index].elem.disp == end_loop->first_elem_disp);
if( !continuity ) { compress.common.flags = loop->common.flags;
if( 0 != last_length ) { compress.common.type = pData->desc.desc[pos_desc + index].elem.common.type;
CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, compress.blocklen = pData->desc.desc[pos_desc + index].elem.blocklen;
last_length, last_disp, last_extent ); for( uint32_t i = index+1; i < loop->items; i++ ) {
pElemDesc++; nbElems++; current = &pData->desc.desc[pos_desc + i].elem;
last_length = 0; assert(1 == current->count);
if( (current->common.type == OPAL_DATATYPE_LOOP) ||
compress.common.type != current->common.type ) {
compress.common.type = OPAL_DATATYPE_UINT1;
compress.blocklen = end_loop->size;
break;
} }
last_disp = total_disp + loop_disp; compress.blocklen += current->blocklen;
} }
last_length = (last_length * opal_datatype_basicDatatypes[last_type]->size compress.count = loop->loops;
+ loop->loops * end_loop->size); compress.extent = loop->extent;
last_type = OPAL_DATATYPE_UINT1; compress.disp = end_loop->first_elem_disp;
last_extent = 1; if( compress.extent == (ptrdiff_t)(compress.blocklen * opal_datatype_basicDatatypes[compress.common.type]->size) ) {
} else { /* The compressed element is contiguous: collapse it into a single large blocklen */
int counter = loop->loops; compress.blocklen *= compress.count;
ptrdiff_t merged_disp = 0; compress.extent *= compress.count;
/* if the previous data is contiguous with this piece and it has a length not ZERO */ compress.count = 1;
if( last_length != 0 ) {
if( continuity ) {
last_length *= opal_datatype_basicDatatypes[last_type]->size;
last_length += end_loop->size;
last_type = OPAL_DATATYPE_UINT1;
last_extent = 1;
counter--;
merged_disp = loop->extent; /* merged loop, update the disp of the remaining elems */
}
CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC,
last_length, last_disp, last_extent );
pElemDesc++; nbElems++;
last_disp += last_length;
last_length = 0;
last_type = OPAL_DATATYPE_LOOP;
} }
/** /**
* The content of the loop is contiguous (maybe with a gap before or after). * The current loop has been compressed and can now be treated as if it
* * was a data element. We can now look if it can be fused with last,
* If any of the loops have been merged with the previous element, then the * as done in the fusion of 2 elements below. Let's use the same code.
* displacement of the first element (or the displacement of all elements if the
* loop will be removed) must be updated accordingly.
*/ */
if( counter <= 2 ) {
merged_disp += end_loop->first_elem_disp;
while( counter > 0 ) {
CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC,
end_loop->size, merged_disp, 1);
pElemDesc++; nbElems++; counter--;
merged_disp += loop->extent;
}
} else {
CREATE_LOOP_START( pElemDesc, counter, 2, loop->extent, loop->common.flags );
pElemDesc++; nbElems++;
CREATE_ELEM( pElemDesc, OPAL_DATATYPE_UINT1, OPAL_DATATYPE_FLAG_BASIC,
end_loop->size, loop_disp, 1);
pElemDesc++; nbElems++;
CREATE_LOOP_END( pElemDesc, 2, end_loop->first_elem_disp + merged_disp,
end_loop->size, end_loop->common.flags );
pElemDesc++; nbElems++;
}
}
pos_desc += loop->items + 1; pos_desc += loop->items + 1;
} else { current = &compress;
ddt_elem_desc_t* elem = (ddt_elem_desc_t*)&(pData->desc.desc[pos_desc+1]); goto fuse_loops;
if( last_length != 0 ) {
CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent );
pElemDesc++; nbElems++;
last_disp += last_length;
last_length = 0;
last_type = OPAL_DATATYPE_LOOP;
} }
if( 2 == loop->items ) { /* small loop */
if( (1 == elem->count) /**
&& (elem->extent == (ptrdiff_t)opal_datatype_basicDatatypes[elem->common.type]->size) ) { * If the content of the loop is not contiguous there is little we can do
CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags & ~OPAL_DATATYPE_FLAG_CONTIGUOUS, * that would not incur significant optimization cost and still be beneficial
loop->loops, elem->disp, loop->extent ); * in reducing the number of memcpy during pack/unpack.
*/
if( 0 != last.count ) { /* Generate the pending element */
CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
last.blocklen, last.count, last.disp, last.extent );
pElemDesc++; nbElems++; pElemDesc++; nbElems++;
pos_desc += loop->items + 1; last.count = 0;
goto complete_loop; last.common.type = OPAL_DATATYPE_LOOP;
} else if( loop->loops < 3 ) { }
ptrdiff_t elem_displ = elem->disp;
for( i = 0; i < loop->loops; i++ ) { /* Can we unroll the loop? */
CREATE_ELEM( pElemDesc, elem->common.type, elem->common.flags, if( (loop->items <= 3) && (loop->loops <= 2) ) {
elem->count, elem_displ, elem->extent ); ptrdiff_t elem_displ = 0;
for( uint32_t i = 0; i < loop->loops; i++ ) {
for( uint32_t j = 0; j < (loop->items - 1); j++ ) {
current = &pData->desc.desc[pos_desc + index + j].elem;
CREATE_ELEM( pElemDesc, current->common.type, current->common.flags,
current->blocklen, current->count, current->disp + elem_displ, current->extent );
pElemDesc++; nbElems++;
}
elem_displ += loop->extent; elem_displ += loop->extent;
pElemDesc++; nbElems++;
} }
pos_desc += loop->items + 1; pos_desc += loop->items + 1;
goto complete_loop; goto complete_loop;
} }
}
CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags ); CREATE_LOOP_START( pElemDesc, loop->loops, loop->items, loop->extent, loop->common.flags );
pElemDesc++; nbElems++; pElemDesc++; nbElems++;
PUSH_STACK( pStack, stack_pos, nbElems, OPAL_DATATYPE_LOOP, loop->loops, total_disp ); PUSH_STACK( pStack, stack_pos, nbElems, OPAL_DATATYPE_LOOP, loop->loops, total_disp );
pos_desc++; pos_desc++;
DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" ); DDT_DUMP_STACK( pStack, stack_pos, pData->desc.desc, "advance loops" );
}
complete_loop: complete_loop:
total_disp = pStack->disp; /* update the displacement */ total_disp = pStack->disp; /* update the displacement */
continue; continue;
} }
while( pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* keep doing it until we reach a non datatype element */ while( pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* go over all basic datatype elements */
/* now here we have a basic datatype */ current = &pData->desc.desc[pos_desc].elem;
type = pData->desc.desc[pos_desc].elem.common.type; pos_desc++; /* point to the next element as current points to the current one */
continuity = ((last_disp + (ptrdiff_t)last_length * (ptrdiff_t)opal_datatype_basicDatatypes[last_type]->size)
== (total_disp + pData->desc.desc[pos_desc].elem.disp));
if( (pData->desc.desc[pos_desc].elem.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && continuity && fuse_loops:
(pData->desc.desc[pos_desc].elem.extent == (int32_t)opal_datatype_basicDatatypes[type]->size) ) { if( 0 == last.count ) { /* first data of the datatype */
if( type == last_type ) { last = *current;
last_length += pData->desc.desc[pos_desc].elem.count; continue; /* next data */
last_extent = pData->desc.desc[pos_desc].elem.extent; }
} else {
if( last_length == 0 ) { /* are the two elements compatible: aka they have very similar values and they
last_type = type; * can be merged together by increasing the count, and/or changing the extent.
last_length = pData->desc.desc[pos_desc].elem.count; */
last_extent = pData->desc.desc[pos_desc].elem.extent; if( (last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) ==
} else { (current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size) ) {
last_length = last_length * opal_datatype_basicDatatypes[last_type]->size + ddt_elem_desc_t save = last; /* safekeep the type and blocklen */
pData->desc.desc[pos_desc].elem.count * opal_datatype_basicDatatypes[type]->size; if( last.common.type != current->common.type ) {
last_type = OPAL_DATATYPE_UINT1; last.blocklen *= opal_datatype_basicDatatypes[last.common.type]->size;
last_extent = 1; last.common.type = OPAL_DATATYPE_UINT1;
}
if( 1 == last.count ) {
/* we can ignore the extent of the element with count == 1 and merge them together if their displacements match */
if( 1 == current->count ) {
last.extent = current->disp - last.disp;
last.count++;
continue;
}
/* can we compute a matching displacement ? */
if( (last.disp + current->extent) == current->disp ) {
last.extent = current->extent;
last.count = current->count + 1;
continue;
} }
} }
last_flags &= pData->desc.desc[pos_desc].elem.common.flags; if( (last.extent * (ptrdiff_t)last.count + last.disp) == current->disp ) {
} else { if( 1 == current->count ) {
if( last_length != 0 ) { last.count++;
CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); continue;
}
if( last.extent == current->extent ) {
last.count += current->count;
continue;
}
}
last.blocklen = save.blocklen;
last.common.type = save.common.type;
/* try other optimizations */
}
/* are the elements fusionable such that we can fusion the last blocklen of one with the first
* blocklen of the other.
*/
if( (ptrdiff_t)(last.disp + (last.count - 1) * last.extent + last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) ==
current->disp ) {
if( last.count != 1 ) {
CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
last.blocklen, last.count - 1, last.disp, last.extent );
pElemDesc++; nbElems++; pElemDesc++; nbElems++;
last.disp += (last.count - 1) * last.extent;
last.count = 1;
} }
last_disp = total_disp + pData->desc.desc[pos_desc].elem.disp; if( last.common.type == current->common.type ) {
last_length = pData->desc.desc[pos_desc].elem.count; last.blocklen += current->blocklen;
last_extent = pData->desc.desc[pos_desc].elem.extent; } else {
last_type = type; last.blocklen = ((last.blocklen * opal_datatype_basicDatatypes[last.common.type]->size) +
(current->blocklen * opal_datatype_basicDatatypes[current->common.type]->size));
last.common.type = OPAL_DATATYPE_UINT1;
} }
pos_desc++; /* advance to the next data */ last.extent += current->extent;
if( current->count != 1 ) {
CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
last.blocklen, last.count, last.disp, last.extent );
pElemDesc++; nbElems++;
last = *current;
last.count -= 1;
last.disp += last.extent;
}
continue;
}
CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
last.blocklen, last.count, last.disp, last.extent );
pElemDesc++; nbElems++;
last = *current;
} }
} }
if( last_length != 0 ) { if( 0 != last.count ) {
CREATE_ELEM( pElemDesc, last_type, OPAL_DATATYPE_FLAG_BASIC, last_length, last_disp, last_extent ); CREATE_ELEM( pElemDesc, last.common.type, OPAL_DATATYPE_FLAG_BASIC,
last.blocklen, last.count, last.disp, last.extent );
pElemDesc++; nbElems++; pElemDesc++; nbElems++;
} }
/* cleanup the stack */ /* cleanup the stack */

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -53,8 +53,6 @@
#endif /* defined(CHECKSUM) */ #endif /* defined(CHECKSUM) */
#define IOVEC_MEM_LIMIT 8192
/* the contig versions does not use the stack. They can easily retrieve /* the contig versions does not use the stack. They can easily retrieve
* the status with just the informations from pConvertor->bConverted. * the status with just the informations from pConvertor->bConverted.
*/ */
@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
unsigned char *source_base = NULL; unsigned char *source_base = NULL;
uint32_t iov_count; uint32_t iov_count;
size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted; size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp); source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp);
/* There are some optimizations that can be done if the upper level /* There are some optimizations that can be done if the upper level
* does not provide a buffer. * does not provide a buffer.
@ -111,15 +108,18 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
uint32_t* out_size, uint32_t* out_size,
size_t* max_data ) size_t* max_data )
{ {
size_t remaining, length, initial_bytes_converted = pConv->bConverted;
const opal_datatype_t* pData = pConv->pDesc; const opal_datatype_t* pData = pConv->pDesc;
dt_stack_t* stack = pConv->pStack; dt_stack_t* stack = pConv->pStack;
ptrdiff_t extent = pData->ub - pData->lb;
unsigned char *user_memory, *packed_buffer; unsigned char *user_memory, *packed_buffer;
uint32_t iov_count, index; uint32_t idx;
size_t i; size_t i;
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted;
ptrdiff_t extent= pData->ub - pData->lb;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
/* The memory layout is contiguous with gaps in the begining and at the end. The datatype true_lb
* is the initial displacement, the size the length of the contiguous area and the extent represent
* how much we should jump between elements.
*/
assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) ); assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) );
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
(void*)pConv->pBaseBuf, *out_size ); ); (void*)pConv->pBaseBuf, *out_size ); );
@ -127,139 +127,97 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
stack[1].type = opal_datatype_uint1.id; stack[1].type = opal_datatype_uint1.id;
} }
/* We can provide directly the pointers in the user buffers (like the convertor_raw) */
if( NULL == iov[0].iov_base ) {
user_memory = pConv->pBaseBuf + pData->true_lb;
/* There are some optimizations that can be done if the upper level for( idx = 0; (idx < (*out_size)) && stack[0].count; idx++ ) {
* does not provide a buffer. iov[idx].iov_base = user_memory + stack[0].disp + stack[1].disp;
*/ iov[idx].iov_len = stack[1].count;
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
pConv->bConverted += stack[1].count;
stack[0].disp += extent;
stack[0].count--;
stack[1].disp = 0;
stack[1].count = pData->size; /* we might need this to update the partial
* length for the first iteration */
}
goto update_status_and_return;
}
for( idx = 0; idx < (*out_size); idx++ ) {
/* Limit the amount of packed data to the data left over on this convertor */ /* Limit the amount of packed data to the data left over on this convertor */
remaining = pConv->local_size - pConv->bConverted; remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */ if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_count].iov_len ) if( remaining > iov[idx].iov_len )
remaining = iov[iov_count].iov_len; remaining = iov[idx].iov_len;
packed_buffer = (unsigned char *)iov[iov_count].iov_base; packed_buffer = (unsigned char *)iov[idx].iov_base;
bConverted = remaining; /* how much will get unpacked this time */ pConv->bConverted += remaining;
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp; user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
i = pConv->count - stack[0].count; /* how many we already packed */
assert(i == (pConv->bConverted / pData->size));
if( packed_buffer == NULL ) { DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n",
/* special case for small data. We avoid allocating memory if we (void*)user_memory, (void*)packed_buffer, remaining ); );
* can fill the iovec directly with the address of the remaining
* data.
*/
if( stack->count < (size_t)((*out_size) - iov_count) ) {
stack[1].count = pData->size - (pConv->bConverted % pData->size);
for( index = iov_count; i < pConv->count; i++, index++ ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = stack[1].count;
stack[0].disp += extent;
pConv->bConverted += stack[1].count;
stack[1].disp = 0; /* reset it for the next round */
stack[1].count = pData->size;
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
}
*out_size = iov_count + index;
*max_data = (pConv->bConverted - initial_bytes_converted);
pConv->flags |= CONVERTOR_COMPLETED;
return 1; /* we're done */
}
/* now special case for big contiguous data with gaps around */
if( pData->size >= IOVEC_MEM_LIMIT ) {
/* as we dont have to copy any data, we can simply fill the iovecs
* with data from the user data description.
*/
for( index = iov_count; (i < pConv->count) && (index < (*out_size));
i++, index++ ) {
if( remaining < pData->size ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = remaining;
remaining = 0;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
break;
} else {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = pData->size;
user_memory += extent;
COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv );
}
remaining -= iov[index].iov_len;
pConv->bConverted += iov[index].iov_len;
}
*out_size = index;
*max_data = (pConv->bConverted - initial_bytes_converted);
if( pConv->bConverted == pConv->local_size ) {
pConv->flags |= CONVERTOR_COMPLETED;
return 1;
}
return 0;
}
}
{
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */ length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
/* data left from last round and enough space in the buffer */ /* data left from last round and enough space in the buffer */
if( (0 != length) && (length <= remaining)) { if( (pData->size != length) && (length <= remaining)) {
/* copy the partial left-over from the previous round */ /* copy the partial left-over from the previous round */
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); (void*)user_memory, (void*)packed_buffer, length ); );
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv ); MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
packed_buffer += length; packed_buffer += length;
user_memory += (extent - pData->size + length);
remaining -= length; remaining -= length;
stack[1].count -= length; stack[1].count -= length;
stack[1].disp += length; /* just in case, we overwrite this below */
if( 0 == stack[1].count) { /* one completed element */ if( 0 == stack[1].count) { /* one completed element */
stack[0].count--; stack[0].count--;
stack[0].disp += extent; stack[0].disp += extent;
if( 0 != stack[0].count ) { /* not yet done */ if( 0 == stack[0].count ) /* not yet done */
break;
stack[1].count = pData->size; stack[1].count = pData->size;
stack[1].disp = 0; stack[1].disp = 0;
} }
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
} }
}
for( i = 0; pData->size <= remaining; i++ ) { for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); ); (void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); );
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv ); MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
packed_buffer += pData->size; packed_buffer += pData->size;
user_memory += extent; user_memory += extent;
remaining -= pData->size; remaining -= pData->size;
} }
stack[0].count -= i; /* the filled up and the entire types */ stack[0].count -= i; /* the entire datatype copied above */
stack[0].disp += (i * extent); stack[0].disp += (i * extent);
stack[1].disp += remaining;
/* Copy the last bits */ /* Copy the last bits */
if( 0 != remaining ) { if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); (void*)user_memory, (void*)packed_buffer, remaining ); );
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv ); MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
user_memory += remaining;
stack[1].count -= remaining; stack[1].count -= remaining;
} stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */
if( 0 == stack[1].count ) { /* prepare for the next element */ if( 0 == stack[1].count ) { /* prepare for the next element */
stack[1].count = pData->size; stack[1].count = pData->size;
stack[1].disp = 0; stack[1].disp = 0;
} }
} }
pConv->bConverted += bConverted;
} }
*out_size = iov_count;
*max_data = (pConv->bConverted - initial_bytes_converted); update_status_and_return:
if( pConv->bConverted == pConv->local_size ) { *out_size = idx;
pConv->flags |= CONVERTOR_COMPLETED; *max_data = pConv->bConverted - initial_bytes_converted;
return 1; if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED;
} return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */
return 0;
} }
/* The pack/unpack functions need a cleanup. I have to create a proper interface to access /* The pack/unpack functions need a cleanup. I have to create a proper interface to access
@ -314,18 +272,32 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor,
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_ptr = (unsigned char *) iov[iov_count].iov_base;
iov_len_local = iov[iov_count].iov_len; iov_len_local = iov[iov_count].iov_len;
while( 1 ) {
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* now here we have a basic datatype */ if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) {
PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, /* we have a partial (less than blocklen) basic datatype */
int rc = PACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc,
conv_ptr, iov_ptr, iov_len_local ); conv_ptr, iov_ptr, iov_len_local );
if( 0 == count_desc ) { /* completed */ if( 0 == rc ) /* not done */
goto complete_loop;
if( 0 == count_desc ) {
conv_ptr = pConvertor->pBaseBuf + pStack->disp; conv_ptr = pConvertor->pBaseBuf + pStack->disp;
pos_desc++; /* advance to the next data */ pos_desc++; /* advance to the next data */
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
continue;
} }
}
}
while( 1 ) {
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* we have a basic datatype (working on full blocks) */
PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
conv_ptr, iov_ptr, iov_len_local );
if( 0 != count_desc ) /* completed? */
goto complete_loop; goto complete_loop;
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
pos_desc++; /* advance to the next data */
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
} }
if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d" DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d"

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
* Copyright (c) 2004-2009 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
@ -19,8 +19,6 @@
#include "opal_config.h" #include "opal_config.h"
#include <stddef.h>
#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT #if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
/* Make use of existing macro to do CUDA style memcpy */ /* Make use of existing macro to do CUDA style memcpy */
#undef MEMCPY_CSUM #undef MEMCPY_CSUM
@ -28,88 +26,181 @@
CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
#endif #endif
static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, /**
* This function deals only with partial elements. The COUNT points however to the whole leftover count,
* but this function is only expected to operate on an amount less than blength, that would allow the rest
* of the pack process to handle only entire blength blocks (plus the left over).
*
* Return 1 if we are now aligned on a block, 0 otherwise.
*/
static inline int
pack_partial_blocklen( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM, const dt_elem_desc_t* ELEM,
size_t* COUNT, size_t* COUNT,
unsigned char** SOURCE, unsigned char** memory,
unsigned char** DESTINATION, unsigned char** packed,
size_t* SPACE ) size_t* SPACE )
{ {
size_t _copy_count = *(COUNT);
size_t _copy_blength;
const ddt_elem_desc_t* _elem = &((ELEM)->elem); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
unsigned char* _source = (*SOURCE) + _elem->disp; size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now = *(COUNT);
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
_copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; assert( *(COUNT) <= _elem->count * _elem->blocklen);
if( (_copy_count * _copy_blength) > *(SPACE) ) {
_copy_count = (*(SPACE) / _copy_blength); /**
if( 0 == _copy_count ) return; /* nothing to do */ * First check if we already did something on this element ? The COUNT is the number
* of remaining predefined types in the current elem, not how many predefined types
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
if( 0 == (do_now = (*COUNT) % _elem->blocklen) )
return 1;
size_t left_in_block = do_now; /* left in the current blocklen */
if( (do_now_bytes * do_now) > *(SPACE) )
do_now = (*SPACE) / do_now_bytes;
do_now_bytes *= do_now;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [partial]\n",
_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
*(memory) += (ptrdiff_t)do_now_bytes;
if( do_now == left_in_block ) /* compensate if completed a blocklen */
*(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size);
*(COUNT) -= do_now;
*(SPACE) -= do_now_bytes;
*(packed) += do_now_bytes;
return (do_now == left_in_block);
}
/**
* Pack entire blocks, plus a possible remainder if SPACE is constrained to less than COUNT elements.
*/
static inline void
pack_predefined_data( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM,
size_t* COUNT,
unsigned char** memory,
unsigned char** packed,
size_t* SPACE )
{
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t cando_count = *(COUNT), do_now_bytes;
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */
assert( *(COUNT) <= _elem->count * _elem->blocklen);
if( (blocklen_bytes * cando_count) > *(SPACE) )
cando_count = (*SPACE) / blocklen_bytes;
/* premptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
for(; cando_count > 0; cando_count--) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n",
(void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
_packed += blocklen_bytes;
_memory += _elem->extent;
}
goto update_and_return;
} }
if( (ptrdiff_t)_copy_blength == _elem->extent ) { if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) {
_copy_blength *= _copy_count; blocklen_bytes *= _elem->blocklen;
/* the extent and the size of the basic datatype are equal */
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf, do { /* Do as many full blocklen as possible */
(CONVERTOR)->pDesc, (CONVERTOR)->count ); OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)*(DESTINATION), (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); );
MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) );
_source += _copy_blength;
*(DESTINATION) += _copy_blength;
} else {
for(size_t _i = 0; _i < _copy_count; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n", DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)*(DESTINATION), (void*)_source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); (void*)_packed, (void*)_memory, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); MEMCPY_CSUM( _packed, _memory, blocklen_bytes, (CONVERTOR) );
*(DESTINATION) += _copy_blength; _packed += blocklen_bytes;
_source += _elem->extent; _memory += _elem->extent;
cando_count -= _elem->blocklen;
} while (_elem->blocklen <= cando_count);
} }
_copy_blength *= _copy_count;
/**
* As an epilog do anything left from the last blocklen.
*/
if( 0 != cando_count ) {
assert( (cando_count < _elem->blocklen) ||
((1 == _elem->count) && (cando_count <= _elem->blocklen)) );
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n",
(void*)_packed, (void*)_memory, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _packed, _memory, do_now_bytes, (CONVERTOR) );
_memory += do_now_bytes;
_packed += do_now_bytes;
} }
*(SOURCE) = _source - _elem->disp;
*(SPACE) -= _copy_blength; update_and_return:
*(COUNT) -= _copy_count; *(memory) = _memory - _elem->disp;
*(SPACE) -= (_packed - *packed);
*(packed) = _packed;
} }
static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR, static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM, const dt_elem_desc_t* ELEM,
size_t* COUNT, size_t* COUNT,
unsigned char** SOURCE, unsigned char** memory,
unsigned char** DESTINATION, unsigned char** packed,
size_t* SPACE ) size_t* SPACE )
{ {
const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM);
const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items);
unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; unsigned char* _memory = (*memory) + _end_loop->first_elem_disp;
size_t _copy_loops = *(COUNT); size_t _copy_loops = *(COUNT);
if( (_copy_loops * _end_loop->size) > *(SPACE) ) if( (_copy_loops * _end_loop->size) > *(SPACE) )
_copy_loops = (*(SPACE) / _end_loop->size); _copy_loops = (*(SPACE) / _end_loop->size);
for(size_t _i = 0; _i < _copy_loops; _i++ ) { for(size_t _i = 0; _i < _copy_loops; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _end_loop->size, (CONVERTOR)->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, _end_loop->size, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu\n", DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)*(DESTINATION), (void*)_source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); (void*)*(packed), (void*)_memory, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); );
MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) ); MEMCPY_CSUM( *(packed), _memory, _end_loop->size, (CONVERTOR) );
*(DESTINATION) += _end_loop->size; *(packed) += _end_loop->size;
_source += _loop->extent; _memory += _loop->extent;
} }
*(SOURCE) = _source - _end_loop->first_elem_disp; *(memory) = _memory - _end_loop->first_elem_disp;
*(SPACE) -= _copy_loops * _end_loop->size; *(SPACE) -= _copy_loops * _end_loop->size;
*(COUNT) -= _copy_loops; *(COUNT) -= _copy_loops;
} }
#define PACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \
ELEM, /* the basic element to be packed */ \
COUNT, /* the number of elements */ \
MEMORY, /* the source pointer (char*) */ \
PACKED, /* the destination pointer (char*) */ \
SPACE ) /* the space in the destination buffer */ \
pack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) )
#define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \ #define PACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \
ELEM, /* the basic element to be packed */ \ ELEM, /* the basic element to be packed */ \
COUNT, /* the number of elements */ \ COUNT, /* the number of elements */ \
SOURCE, /* the source pointer (char*) */ \ MEMORY, /* the source pointer (char*) */ \
DESTINATION, /* the destination pointer (char*) */ \ PACKED, /* the destination pointer (char*) */ \
SPACE ) /* the space in the destination buffer */ \ SPACE ) /* the space in the destination buffer */ \
pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) )
#define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ #define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, MEMORY, PACKED, SPACE ) \
pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(MEMORY), &(PACKED), &(SPACE) )
#endif /* OPAL_DATATYPE_PACK_H_HAS_BEEN_INCLUDED */ #endif /* OPAL_DATATYPE_PACK_H_HAS_BEEN_INCLUDED */

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -49,10 +49,24 @@
* - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless. * - the DT_CONTIGUOUS flag for the type OPAL_DATATYPE_END_LOOP is meaningless.
*/ */
static inline void
position_single_block(opal_convertor_t* CONVERTOR,
unsigned char** mem, ptrdiff_t mem_update,
size_t* space, size_t space_update,
size_t* cnt, size_t cnt_update)
{
OPAL_DATATYPE_SAFEGUARD_POINTER( *mem, mem_update, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "position( %p, %lu ) => space %lu [prolog]\n",
(void*)*mem, (unsigned long)space_update, (unsigned long)(*space) ); );
*mem += mem_update;
*space -= space_update;
*cnt -= cnt_update;
}
/** /**
* Advance the current position in the convertor based using the * Advance the convertors' position according. Update the pointer and the remaining space
* current element and a left-over counter. Update the head pointer * accordingly.
* and the leftover byte space.
*/ */
static inline void static inline void
position_predefined_data( opal_convertor_t* CONVERTOR, position_predefined_data( opal_convertor_t* CONVERTOR,
@ -61,66 +75,92 @@ position_predefined_data( opal_convertor_t* CONVERTOR,
unsigned char** POINTER, unsigned char** POINTER,
size_t* SPACE ) size_t* SPACE )
{ {
size_t _copy_count = *(COUNT); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t _copy_blength; size_t total_count = _elem->count * _elem->blocklen;
ddt_elem_desc_t* _elem = &((ELEM)->elem); size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
unsigned char* _memory = (*POINTER) + _elem->disp;
_copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; assert( *(COUNT) <= _elem->count * _elem->blocklen);
if( (_copy_count * _copy_blength) > *(SPACE) ) {
_copy_count = *(SPACE) / _copy_blength; if( cando_count > *(COUNT) )
if( 0 == _copy_count ) return; /* nothing to do */ cando_count = *(COUNT);
if( 1 == _elem->blocklen ) {
DO_DEBUG( opal_output( 0, "position( %p, %" PRIsize_t " ) x (count %" PRIsize_t ", extent %ld) => space %lu [prolog]\n",
(void*)_memory, (unsigned long)do_now_bytes, cando_count, _elem->extent, (unsigned long)(*SPACE) ); );
_memory += cando_count * _elem->extent;
*SPACE -= cando_count * do_now_bytes;
*COUNT -= cando_count;
goto update_and_return;
} }
_copy_blength *= _copy_count;
OPAL_DATATYPE_SAFEGUARD_POINTER( *(POINTER) + _elem->disp, _copy_blength, (CONVERTOR)->pBaseBuf, /**
(CONVERTOR)->pDesc, (CONVERTOR)->count ); * First check if we already did something on this element ?
*(POINTER) += (_copy_count * _elem->extent);
*(SPACE) -= _copy_blength;
*(COUNT) -= _copy_count;
}
/**
* Advance the current position in the convertor based using the
* current contiguous loop and a left-over counter. Update the head
* pointer and the leftover byte space.
*/ */
static inline void do_now = (total_count - *(COUNT)); /* done elements */
position_contiguous_loop( opal_convertor_t* CONVERTOR, if( 0 != do_now ) {
dt_elem_desc_t* ELEM, do_now = do_now % _elem->blocklen; /* partial blocklen? */
size_t* COUNT,
unsigned char** POINTER,
size_t* SPACE )
{
ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM);
ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + (ELEM)->loop.items);
size_t _copy_loops = *(COUNT);
if( (_copy_loops * _end_loop->size) > *(SPACE) ) if( 0 != do_now ) {
_copy_loops = *(SPACE) / _end_loop->size; size_t left_in_block = _elem->blocklen - do_now; /* left in the current blocklen */
OPAL_DATATYPE_SAFEGUARD_POINTER( *(POINTER) + _end_loop->first_elem_disp, do_now = (left_in_block > cando_count ) ? cando_count : left_in_block;
(_copy_loops - 1) * _loop->extent + _end_loop->size, do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
(CONVERTOR)->pBaseBuf, (CONVERTOR)->pDesc, (CONVERTOR)->count );
*(POINTER) += _copy_loops * _loop->extent; position_single_block( CONVERTOR, &_memory, do_now_bytes,
*(SPACE) -= _copy_loops * _end_loop->size; SPACE, do_now_bytes, COUNT, do_now );
*(COUNT) -= _copy_loops;
/* compensate if we just completed a blocklen */
if( do_now == left_in_block )
_memory += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size);
cando_count -= do_now;
}
}
/**
* Compute how many full blocklen we need to do and do them.
*/
do_now = cando_count / _elem->blocklen;
if( 0 != do_now ) {
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
#if OPAL_ENABLE_DEBUG
for(size_t _i = 0; _i < do_now; _i++ ) {
position_single_block( CONVERTOR, &_memory, _elem->extent,
SPACE, do_now_bytes, COUNT, _elem->blocklen );
cando_count -= _elem->blocklen;
}
#else
_memory += do_now * _elem->extent;
*SPACE -= do_now * do_now_bytes;
*COUNT -= do_now * _elem->blocklen;
cando_count -= do_now * _elem->blocklen;
#endif /* OPAL_ENABLE_DEBUG */
}
/**
* As an epilog do anything left from the last blocklen.
*/
do_now = cando_count;
if( 0 != do_now ) {
do_now_bytes = do_now * opal_datatype_basicDatatypes[_elem->common.type]->size;
position_single_block( CONVERTOR, &_memory, do_now_bytes,
SPACE, do_now_bytes, COUNT, do_now );
}
update_and_return:
*(POINTER) = _memory - _elem->disp;
} }
#define POSITION_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, POSITION, SPACE ) \
position_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(POSITION), &(SPACE) )
#define POSITION_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, POSITION, SPACE ) \
position_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(POSITION), &(SPACE) )
int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor, int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
size_t* position ) size_t* position )
{ {
dt_stack_t* pStack; /* pointer to the position on the stack */ dt_stack_t* pStack; /* pointer to the position on the stack */
uint32_t pos_desc; /* actual position in the description of the derived datatype */ uint32_t pos_desc; /* actual position in the description of the derived datatype */
size_t count_desc; /* the number of items already done in the actual pos_desc */ size_t count_desc; /* the number of items already done in the actual pos_desc */
size_t iov_len_local;
dt_elem_desc_t* description = pConvertor->use_desc->desc; dt_elem_desc_t* description = pConvertor->use_desc->desc;
dt_elem_desc_t* pElem; /* current position */ dt_elem_desc_t* pElem; /* current position */
unsigned char *base_pointer = pConvertor->pBaseBuf; unsigned char *base_pointer = pConvertor->pBaseBuf;
size_t iov_len_local;
ptrdiff_t extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb; ptrdiff_t extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb;
DUMP( "opal_convertor_generic_simple_position( %p, &%ld )\n", (void*)pConvertor, (long)*position ); DUMP( "opal_convertor_generic_simple_position( %p, &%ld )\n", (void*)pConvertor, (long)*position );
@ -128,8 +168,8 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
/* We dont want to have to parse the datatype multiple times. What we are interested in /* We dont want to have to parse the datatype multiple times. What we are interested in
* here is to compute the number of completed datatypes that we can move forward, update * here is to compute the number of completed datatypes that we can move forward, update
* the counters and finally compute the position taking in account only the remaining * the counters and compute the position taking in account only the remaining elements.
* elements. The only problem is that we have to modify all the elements on the stack. * The only problem is that we have to modify all the elements on the stack.
*/ */
iov_len_local = *position - pConvertor->bConverted; iov_len_local = *position - pConvertor->bConverted;
if( iov_len_local > pConvertor->pDesc->size ) { if( iov_len_local > pConvertor->pDesc->size ) {
@ -171,21 +211,19 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
assert(pConvertor->partial_length < element_length); assert(pConvertor->partial_length < element_length);
return 0; return 0;
} }
pConvertor->partial_length = (pConvertor->partial_length + missing_length) % element_length; pConvertor->partial_length = 0;
assert(pConvertor->partial_length == 0);
pConvertor->bConverted += missing_length; pConvertor->bConverted += missing_length;
iov_len_local -= missing_length; iov_len_local -= missing_length;
count_desc--; count_desc--;
} }
while( 1 ) { while( 1 ) {
if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the the entire datatype */
DO_DEBUG( opal_output( 0, "position end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %lx space %lu\n", DO_DEBUG( opal_output( 0, "position end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %lx space %lu\n",
pStack->count, pConvertor->stack_pos, pos_desc, pStack->count, pConvertor->stack_pos, pos_desc,
pStack->disp, (unsigned long)iov_len_local ); ); pStack->disp, (unsigned long)iov_len_local ); );
if( --(pStack->count) == 0 ) { /* end of loop */ if( --(pStack->count) == 0 ) { /* end of loop */
if( pConvertor->stack_pos == 0 ) { if( pConvertor->stack_pos == 0 ) {
pConvertor->flags |= CONVERTOR_COMPLETED; pConvertor->flags |= CONVERTOR_COMPLETED;
pConvertor->partial_length = 0;
goto complete_loop; /* completed */ goto complete_loop; /* completed */
} }
pConvertor->stack_pos--; pConvertor->stack_pos--;
@ -194,11 +232,13 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
} else { } else {
if( pStack->index == -1 ) { if( pStack->index == -1 ) {
pStack->disp += extent; pStack->disp += extent;
pos_desc = 0; /* back to the first element */
} else { } else {
assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type );
pStack->disp += description[pStack->index].loop.extent; pStack->disp += description[pStack->index].loop.extent;
pos_desc = pStack->index; /* go back to the loop start itself to give a chance
* to move forward by entire loops */
} }
pos_desc = pStack->index + 1;
} }
base_pointer = pConvertor->pBaseBuf + pStack->disp; base_pointer = pConvertor->pBaseBuf + pStack->disp;
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
@ -208,9 +248,14 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
} }
if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
ptrdiff_t local_disp = (ptrdiff_t)base_pointer; ptrdiff_t local_disp = (ptrdiff_t)base_pointer;
if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { ddt_endloop_desc_t* end_loop = (ddt_endloop_desc_t*)(pElem + pElem->loop.items);
POSITION_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, size_t full_loops = iov_len_local / end_loop->size;
base_pointer, iov_len_local ); full_loops = count_desc <= full_loops ? count_desc : full_loops;
if( full_loops ) {
base_pointer += full_loops * pElem->loop.extent;
iov_len_local -= full_loops * end_loop->size;
count_desc -= full_loops;
if( 0 == count_desc ) { /* completed */ if( 0 == count_desc ) { /* completed */
pos_desc += pElem->loop.items + 1; pos_desc += pElem->loop.items + 1;
goto update_loop_description; goto update_loop_description;
@ -232,8 +277,7 @@ int opal_convertor_generic_simple_position( opal_convertor_t* pConvertor,
} }
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* now here we have a basic datatype */ /* now here we have a basic datatype */
POSITION_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, position_predefined_data( pConvertor, pElem, &count_desc, &base_pointer, &iov_len_local );
base_pointer, iov_len_local );
if( 0 != count_desc ) { /* completed */ if( 0 != count_desc ) { /* completed */
pConvertor->partial_length = iov_len_local; pConvertor->partial_length = iov_len_local;
goto complete_loop; goto complete_loop;

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2017 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -70,98 +70,82 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
{ {
const opal_datatype_t *pData = pConv->pDesc; const opal_datatype_t *pData = pConv->pDesc;
unsigned char *user_memory, *packed_buffer; unsigned char *user_memory, *packed_buffer;
uint32_t iov_count, i; uint32_t iov_idx, i;
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted; size_t remaining, initial_bytes_converted = pConv->bConverted;
dt_stack_t* stack = pConv->pStack; dt_stack_t* stack = pConv->pStack;
ptrdiff_t extent = pData->ub - pData->lb; ptrdiff_t extent = pData->ub - pData->lb;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n", DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( pBaseBuf %p, iov count %d )\n",
(void*)pConv->pBaseBuf, *out_size ); ); (void*)pConv->pBaseBuf, *out_size ); );
if( stack[1].type != opal_datatype_uint1.id ) { if( stack[1].type != opal_datatype_uint1.id ) {
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size; stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
stack[1].type = opal_datatype_uint1.id; stack[1].type = opal_datatype_uint1.id;
} }
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_count].iov_len )
remaining = iov[iov_count].iov_len;
packed_buffer = (unsigned char*)iov[iov_count].iov_base;
bConverted = remaining; /* how much will get unpacked this time */
user_memory = pConv->pBaseBuf + initial_displ;
if( (ptrdiff_t)pData->size == extent ) { if( (ptrdiff_t)pData->size == extent ) {
user_memory += pConv->bConverted; for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) {
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", remaining = pConv->local_size - pConv->bConverted;
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_idx].iov_len )
remaining = iov[iov_idx].iov_len;
packed_buffer = (unsigned char*)iov[iov_idx].iov_base;
user_memory = pConv->pBaseBuf + pData->true_lb + pConv->bConverted;
/* contiguous data or basic datatype with count */ /* contiguous data or basic datatype with count */
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining,
pConv->pBaseBuf, pData, pConv->count ); pConv->pBaseBuf, pData, pConv->count );
DO_DEBUG( opal_output( 0, "1. unpack contig dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "unpack contig [%d] dest %p src %p length %" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); );
MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
pConv->bConverted += remaining; /* how much will get unpacked this time */
}
} else { } else {
user_memory += stack[0].disp + stack[1].disp; for( iov_idx = 0; iov_idx < (*out_size); iov_idx++ ) {
remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_idx].iov_len )
remaining = iov[iov_idx].iov_len;
DO_DEBUG( opal_output( 0, "unpack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n", packed_buffer = (unsigned char*)iov[iov_idx].iov_base;
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
pConv->bConverted += remaining; /* how much will get unpacked this time */
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last unpack */ for( i = 0; stack[1].count <= remaining; i++ ) { /* partial or full data */
/* complete the last copy */ OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, stack[1].count, pConv->pBaseBuf,
if( (0 != length) && (length <= remaining) ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. unpack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [%d]\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)length ); ); iov_idx, (void*)user_memory, (void*)packed_buffer, stack[1].count, i ); );
MEMCPY_CSUM( user_memory, packed_buffer, length, pConv ); MEMCPY_CSUM( user_memory, packed_buffer, stack[1].count, pConv );
packed_buffer += length;
user_memory += (extent - (pData->size - length)); packed_buffer += stack[1].count;
remaining -= length; remaining -= stack[1].count;
stack[1].count -= length;
if( 0 == stack[1].count) { /* one completed element */
stack[0].count--; stack[0].count--;
stack[0].disp += extent; stack[0].disp += extent;
if( 0 != stack[0].count ) { /* not yet done */
stack[1].count = pData->size; stack[1].count = pData->size;
stack[1].disp = 0; stack[1].disp = 0;
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp;
} }
}
} /* Copy the last bits */
for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "3. unpack dest %p src %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); );
MEMCPY_CSUM( user_memory, packed_buffer, pData->size, pConv );
packed_buffer += pData->size;
user_memory += extent;
remaining -= pData->size;
}
stack[0].count -= i;
stack[0].disp += (i * extent);
stack[1].disp += remaining;
/* copy the last bits */
if( 0 != remaining ) { if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count ); pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. unpack dest %p src %p length %lu\n", DO_DEBUG( opal_output( 0, "unpack gaps [%d] dest %p src %p length %" PRIsize_t " [epilog]\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); ); iov_idx, (void*)user_memory, (void*)packed_buffer, remaining ); );
MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv ); MEMCPY_CSUM( user_memory, packed_buffer, remaining, pConv );
user_memory += remaining;
stack[1].count -= remaining; stack[1].count -= remaining;
stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */
assert( stack[1].count );
} }
} }
pConv->bConverted += bConverted;
} }
*out_size = iov_count; /* we only reach this line after the for loop succesfully complete */ *out_size = iov_idx; /* we only reach this line after the for loop succesfully complete */
*max_data = (pConv->bConverted - initial_bytes_converted); *max_data = pConv->bConverted - initial_bytes_converted;
if( pConv->bConverted == pConv->local_size ) { if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED;
pConv->flags |= CONVERTOR_COMPLETED; return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */
return 1;
}
return 0;
} }
/** /**
@ -179,7 +163,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
static inline void static inline void
opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem, opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pElem,
unsigned char* partial_data, unsigned char* partial_data,
ptrdiff_t start_position, ptrdiff_t length, ptrdiff_t start_position, size_t length,
unsigned char** user_buffer ) unsigned char** user_buffer )
{ {
char unused_byte = 0x7F, saved_data[16]; char unused_byte = 0x7F, saved_data[16];
@ -195,7 +179,7 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
/* Find a byte that is not used in the partial buffer */ /* Find a byte that is not used in the partial buffer */
find_unused_byte: find_unused_byte:
for(ptrdiff_t i = 0; i < length; i++ ) { for(size_t i = 0; i < length; i++ ) {
if( unused_byte == partial_data[i] ) { if( unused_byte == partial_data[i] ) {
unused_byte--; unused_byte--;
goto find_unused_byte; goto find_unused_byte;
@ -298,6 +282,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
iov_ptr = (unsigned char *) iov[iov_count].iov_base; iov_ptr = (unsigned char *) iov[iov_count].iov_base;
iov_len_local = iov[iov_count].iov_len; iov_len_local = iov[iov_count].iov_len;
if( 0 != pConvertor->partial_length ) { if( 0 != pConvertor->partial_length ) {
size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;
size_t missing_length = element_length - pConvertor->partial_length; size_t missing_length = element_length - pConvertor->partial_length;
@ -306,7 +291,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); COMPUTE_CSUM( iov_ptr, missing_length, pConvertor );
opal_unpack_partial_datatype( pConvertor, pElem, opal_unpack_partial_datatype( pConvertor, pElem,
iov_ptr, iov_ptr,
pConvertor->partial_length, element_length - pConvertor->partial_length, pConvertor->partial_length, (size_t)(element_length - pConvertor->partial_length),
&conv_ptr ); &conv_ptr );
--count_desc; --count_desc;
if( 0 == count_desc ) { if( 0 == count_desc ) {
@ -318,34 +303,31 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
iov_len_local -= missing_length; iov_len_local -= missing_length;
pConvertor->partial_length = 0; /* nothing more inside */ pConvertor->partial_length = 0; /* nothing more inside */
} }
while( 1 ) { if( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { if( (pElem->elem.count * pElem->elem.blocklen) != count_desc ) {
/* now here we have a basic datatype */ /* we have a partial (less than blocklen) basic datatype */
UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, int rc = UNPACK_PARTIAL_BLOCKLEN( pConvertor, pElem, count_desc,
iov_ptr, conv_ptr, iov_len_local ); iov_ptr, conv_ptr, iov_len_local );
if( 0 == count_desc ) { /* completed */ if( 0 == rc ) /* not done */
goto complete_loop;
if( 0 == count_desc ) {
conv_ptr = pConvertor->pBaseBuf + pStack->disp; conv_ptr = pConvertor->pBaseBuf + pStack->disp;
pos_desc++; /* advance to the next data */ pos_desc++; /* advance to the next data */
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
continue;
} }
assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
if( 0 != iov_len_local ) {
unsigned char* temp = conv_ptr;
/* We have some partial data here. Let's copy it into the convertor
* and keep it hot until the next round.
*/
assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size );
COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor );
opal_unpack_partial_datatype( pConvertor, pElem,
iov_ptr, 0, iov_len_local,
&temp );
pConvertor->partial_length = iov_len_local;
iov_len_local = 0;
} }
}
while( 1 ) {
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
/* we have a basic datatype (working on full blocks) */
UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
iov_ptr, conv_ptr, iov_len_local );
if( 0 != count_desc ) /* completed? */
goto complete_loop; goto complete_loop;
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
pos_desc++; /* advance to the next data */
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
} }
if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n", DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n",
@ -353,11 +335,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
pStack->disp, (unsigned long)iov_len_local ); ); pStack->disp, (unsigned long)iov_len_local ); );
if( --(pStack->count) == 0 ) { /* end of loop */ if( --(pStack->count) == 0 ) { /* end of loop */
if( 0 == pConvertor->stack_pos ) { if( 0 == pConvertor->stack_pos ) {
/* Do the same thing as when the loop is completed */ /* we're done. Force the exit of the main for loop (around iovec) */
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ *out_size = iov_count;
total_unpacked += iov[iov_count].iov_len; goto complete_loop;
iov_count++; /* go to the next */
goto complete_conversion;
} }
pConvertor->stack_pos--; pConvertor->stack_pos--;
pStack--; pStack--;
@ -396,14 +376,29 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
conv_ptr = pConvertor->pBaseBuf + pStack->disp; conv_ptr = pConvertor->pBaseBuf + pStack->disp;
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" );
continue;
} }
} }
complete_loop: complete_loop:
assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
if( 0 != iov_len_local ) {
unsigned char* temp = conv_ptr;
/* We have some partial data here. Let's copy it into the convertor
* and keep it hot until the next round.
*/
assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size );
COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor );
opal_unpack_partial_datatype( pConvertor, pElem,
iov_ptr, 0, iov_len_local,
&temp );
pConvertor->partial_length = iov_len_local;
iov_len_local = 0;
}
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
total_unpacked += iov[iov_count].iov_len; total_unpacked += iov[iov_count].iov_len;
} }
complete_conversion:
*max_data = total_unpacked; *max_data = total_unpacked;
pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
*out_size = iov_count; *out_size = iov_count;
@ -530,11 +525,9 @@ opal_unpack_general_function( opal_convertor_t* pConvertor,
pStack->disp, (unsigned long)iov_len_local ); ); pStack->disp, (unsigned long)iov_len_local ); );
if( --(pStack->count) == 0 ) { /* end of loop */ if( --(pStack->count) == 0 ) { /* end of loop */
if( 0 == pConvertor->stack_pos ) { if( 0 == pConvertor->stack_pos ) {
/* Do the same thing as when the loop is completed */ /* we're done. Force the exit of the main for loop (around iovec) */
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ *out_size = iov_count;
total_unpacked += iov[iov_count].iov_len; goto complete_loop;
iov_count++; /* go to the next */
goto complete_conversion;
} }
pConvertor->stack_pos--; pConvertor->stack_pos--;
pStack--; pStack--;
@ -568,7 +561,6 @@ opal_unpack_general_function( opal_convertor_t* pConvertor,
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
total_unpacked += iov[iov_count].iov_len; total_unpacked += iov[iov_count].iov_len;
} }
complete_conversion:
*max_data = total_unpacked; *max_data = total_unpacked;
pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
*out_size = iov_count; *out_size = iov_count;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
* Copyright (c) 2004-2009 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
@ -26,84 +26,178 @@
CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
#endif #endif
static inline void /**
unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the convertor */ * This function deals only with partial elements. The COUNT points however to the whole leftover count,
const dt_elem_desc_t* ELEM, /* the element description */ * but this function is only expected to operate on an amount less than blength, that would allow the rest
size_t* COUNT, /* the number of elements */ * of the pack process to handle only entire blength blocks (plus the left over).
unsigned char** SOURCE, /* the source pointer */ *
unsigned char** DESTINATION, /* the destination pointer */ * Return 1 if we are now aligned on a block, 0 otherwise.
size_t* SPACE ) /* the space in the destination buffer */ */
static inline int
unpack_partial_blocklen( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM,
size_t* COUNT,
unsigned char** packed,
unsigned char** memory,
size_t* SPACE )
{ {
size_t _copy_count = *(COUNT);
size_t _copy_blength;
const ddt_elem_desc_t* _elem = &((ELEM)->elem); const ddt_elem_desc_t* _elem = &((ELEM)->elem);
unsigned char* _destination = (*DESTINATION) + _elem->disp; size_t do_now_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now = (*COUNT);
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
_copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; assert( *(COUNT) <= (_elem->count * _elem->blocklen));
if( (_copy_count * _copy_blength) > *(SPACE) ) {
_copy_count = (*(SPACE) / _copy_blength); /**
if( 0 == _copy_count ) return; /* nothing to do */ * First check if we already did something on this element ? The COUNT is the number
* of remaining predefined types in the current elem, not how many predefined types
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
if( 0 == (do_now = (*COUNT) % _elem->blocklen) )
return 1;
size_t left_in_block = do_now; /* left in the current blocklen */
if( (do_now_bytes * do_now) > *(SPACE) )
do_now = (*SPACE) / do_now_bytes;
do_now_bytes *= do_now;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [prolog]\n",
(void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE)) ); );
MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) );
*(memory) += (ptrdiff_t)do_now_bytes;
if( do_now == left_in_block ) /* compensate if completed a blocklen */
*(memory) += _elem->extent - (_elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size);
*(COUNT) -= do_now;
*(SPACE) -= do_now_bytes;
*(packed) += do_now_bytes;
return (do_now == left_in_block);
}
static inline void
unpack_predefined_data( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM,
size_t* COUNT,
unsigned char** packed,
unsigned char** memory,
size_t* SPACE )
{
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t cando_count = (*COUNT), do_now_bytes;
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
assert( 0 == (cando_count % _elem->blocklen) ); /* no partials here */
assert( *(COUNT) <= (_elem->count * _elem->blocklen));
if( (blocklen_bytes * cando_count) > *(SPACE) )
cando_count = (*SPACE) / blocklen_bytes;
/* premptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
for(; cando_count > 0; cando_count--) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "unpack memcpy( %p, %p, %lu ) => space %lu [blen = 1]\n",
(void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) );
_packed += blocklen_bytes;
_memory += _elem->extent;
}
goto update_and_return;
} }
if( (ptrdiff_t)_copy_blength == _elem->extent ) { if( (1 < _elem->count) && (_elem->blocklen <= cando_count) ) {
_copy_blength *= _copy_count; blocklen_bytes *= _elem->blocklen;
/* the extent and the size of the basic datatype are equal */
OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf, do { /* Do as many full blocklen as possible */
(CONVERTOR)->pDesc, (CONVERTOR)->count ); OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)_destination, (void*)*(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); );
MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) );
*(SOURCE) += _copy_blength;
_destination += _copy_blength;
} else {
for(size_t _i = 0; _i < _copy_count; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n", DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)_destination, (void*)*(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); (void*)_memory, (void*)_packed, (unsigned long)blocklen_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) ); MEMCPY_CSUM( _memory, _packed, blocklen_bytes, (CONVERTOR) );
*(SOURCE) += _copy_blength; _packed += blocklen_bytes;
_destination += _elem->extent; _memory += _elem->extent;
cando_count -= _elem->blocklen;
} while (_elem->blocklen <= cando_count);
} }
_copy_blength *= _copy_count;
/**
* As an epilog do anything left from the last blocklen.
*/
if( 0 != cando_count ) {
assert( (cando_count < _elem->blocklen) ||
((1 == _elem->count) && (cando_count <= _elem->blocklen)) );
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu [epilog]\n",
(void*)_memory, (void*)_packed, (unsigned long)do_now_bytes, (unsigned long)(*(SPACE) - (_packed - *(packed))) ); );
MEMCPY_CSUM( _memory, _packed, do_now_bytes, (CONVERTOR) );
_memory += do_now_bytes;
_packed += do_now_bytes;
} }
(*DESTINATION) = _destination - _elem->disp;
*(SPACE) -= _copy_blength; update_and_return:
*(COUNT) -= _copy_count; *(memory) = _memory - _elem->disp;
*(SPACE) -= (_packed - *packed);
*(packed) = _packed;
} }
static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR, static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM, const dt_elem_desc_t* ELEM,
size_t* COUNT, size_t* COUNT,
unsigned char** SOURCE, unsigned char** packed,
unsigned char** DESTINATION, unsigned char** memory,
size_t* SPACE ) size_t* SPACE )
{ {
const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); const ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM);
const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); const ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items);
unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; unsigned char* _memory = (*memory) + _end_loop->first_elem_disp;
size_t _copy_loops = *(COUNT); size_t _copy_loops = *(COUNT);
if( (_copy_loops * _end_loop->size) > *(SPACE) ) if( (_copy_loops * _end_loop->size) > *(SPACE) )
_copy_loops = (*(SPACE) / _end_loop->size); _copy_loops = (*(SPACE) / _end_loop->size);
for(size_t _i = 0; _i < _copy_loops; _i++ ) { for(size_t _i = 0; _i < _copy_loops; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _destination, _end_loop->size, (CONVERTOR)->pBaseBuf, OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, _end_loop->size, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count ); (CONVERTOR)->pDesc, (CONVERTOR)->count );
DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu\n", DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu\n",
(void*)_destination, (void*)*(SOURCE), (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); (void*)_memory, (void*)*(packed), (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); );
MEMCPY_CSUM( _destination, *(SOURCE), _end_loop->size, (CONVERTOR) ); MEMCPY_CSUM( _memory, *(packed), _end_loop->size, (CONVERTOR) );
*(SOURCE) += _end_loop->size; *(packed) += _end_loop->size;
_destination += _loop->extent; _memory += _loop->extent;
} }
*(DESTINATION) = _destination - _end_loop->first_elem_disp; *(memory) = _memory - _end_loop->first_elem_disp;
*(SPACE) -= _copy_loops * _end_loop->size; *(SPACE) -= _copy_loops * _end_loop->size;
*(COUNT) -= _copy_loops; *(COUNT) -= _copy_loops;
} }
#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ #define UNPACK_PARTIAL_BLOCKLEN( CONVERTOR, /* the convertor */ \
unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) ELEM, /* the basic element to be packed */ \
COUNT, /* the number of elements */ \
PACKED, /* the destination pointer (char*) */ \
MEMORY, /* the source pointer (char*) */ \
SPACE ) /* the space in the destination buffer */ \
unpack_partial_blocklen( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) )
#define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \ #define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, /* the convertor */ \
unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) ) ELEM, /* the basic element to be packed */ \
COUNT, /* the number of elements */ \
PACKED, /* the destination pointer (char*) */ \
MEMORY, /* the source pointer (char*) */ \
SPACE ) /* the space in the destination buffer */ \
unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) )
#define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, PACKED, MEMORY, SPACE ) \
unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(PACKED), &(MEMORY), &(SPACE) )
#endif /* OPAL_DATATYPE_UNPACK_H_HAS_BEEN_INCLUDED */ #endif /* OPAL_DATATYPE_UNPACK_H_HAS_BEEN_INCLUDED */

Просмотреть файл

@ -33,9 +33,6 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype,
uint32_t *iovec_count, uint32_t *iovec_count,
int increment) int increment)
{ {
opal_convertor_t *convertor; opal_convertor_t *convertor;
size_t remaining_length = 0; size_t remaining_length = 0;
uint32_t i; uint32_t i;
@ -43,7 +40,6 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype,
struct iovec *temp_iov=NULL; struct iovec *temp_iov=NULL;
size_t temp_data; size_t temp_data;
convertor = opal_convertor_create( opal_local_arch, 0 ); convertor = opal_convertor_create( opal_local_arch, 0 );
if (OMPI_SUCCESS != opal_convertor_prepare_for_send (convertor, if (OMPI_SUCCESS != opal_convertor_prepare_for_send (convertor,
@ -69,10 +65,8 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype,
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
while (0 == opal_convertor_raw(convertor, while (0 == opal_convertor_raw(convertor, temp_iov,
temp_iov, &temp_count, &temp_data)) {
&temp_count,
&temp_data)) {
*iovec_count = *iovec_count + temp_count; *iovec_count = *iovec_count + temp_count;
*iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec)); *iov = (struct iovec *) realloc (*iov, *iovec_count * sizeof(struct iovec));
if (NULL == *iov) { if (NULL == *iov) {
@ -80,7 +74,7 @@ mca_common_ompio_decode_datatype ( ompi_datatype_t *datatype,
free(temp_iov); free(temp_iov);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
for (i=0 ; i<temp_count ; i++) { for (i = 0 ; i < temp_count ; i++) {
(*iov)[i+(*iovec_count-temp_count)].iov_base = temp_iov[i].iov_base; (*iov)[i+(*iovec_count-temp_count)].iov_base = temp_iov[i].iov_base;
(*iov)[i+(*iovec_count-temp_count)].iov_len = temp_iov[i].iov_len; (*iov)[i+(*iovec_count-temp_count)].iov_len = temp_iov[i].iov_len;
} }
@ -342,7 +336,6 @@ int main (int argc, char *argv[]) {
struct iovec * iov_1 = NULL; struct iovec * iov_1 = NULL;
mca_common_ompio_decode_datatype ( datatype, 1, &iov_1, &iovec_count_1, 1); mca_common_ompio_decode_datatype ( datatype, 1, &iov_1, &iovec_count_1, 1);
assert(iovec_count_300 == iovec_count_10); assert(iovec_count_300 == iovec_count_10);
assert(iovec_count_300 == iovec_count_1); assert(iovec_count_300 == iovec_count_1);
// assert(iov[100].iov_base == iov2[100].iov_base); // assert(iov[100].iov_base == iov2[100].iov_base);

Просмотреть файл

@ -159,8 +159,7 @@ static int local_copy_ddt_count( opal_datatype_t const * const pdt, int count )
osrc = (char*)malloc( malloced_size ); osrc = (char*)malloc( malloced_size );
{ {
for( size_t i = 0; i < malloced_size; i++ ) for( size_t i = 0; i < malloced_size; i++ ) osrc[i] = i % 128 + 32;
osrc[i] = i % 128 + 32;
memcpy(odst, osrc, malloced_size); memcpy(odst, osrc, malloced_size);
} }
pdst = odst - lb; pdst = odst - lb;

Просмотреть файл

@ -445,7 +445,7 @@ static int32_t opal_datatype_create_vector( int count, int bLength, int stride,
} }
pData = opal_datatype_create( oldType->desc.used + 2 ); pData = opal_datatype_create( oldType->desc.used + 2 );
if( (bLength == stride) || (1 >= count) ) { /* the elements are contiguous */ if( (bLength == stride) || (1 == count) ) { /* the elements are contiguous */
opal_datatype_add( pData, oldType, count * bLength, 0, extent ); opal_datatype_add( pData, oldType, count * bLength, 0, extent );
} else { } else {
if( 1 == bLength ) { if( 1 == bLength ) {
@ -476,7 +476,7 @@ static int32_t opal_datatype_create_hvector( int count, int bLength, ptrdiff_t s
} }
pTempData = opal_datatype_create( oldType->desc.used + 2 ); pTempData = opal_datatype_create( oldType->desc.used + 2 );
if( ((extent * bLength) == stride) || (1 >= count) ) { /* contiguous */ if( ((extent * bLength) == stride) || (1 == count) ) { /* contiguous */
pData = pTempData; pData = pTempData;
opal_datatype_add( pData, oldType, count * bLength, 0, extent ); opal_datatype_add( pData, oldType, count * bLength, 0, extent );
} else { } else {

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
* Copyright (c) 2004-2014 The University of Tennessee and The University * Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -15,8 +15,9 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h> #include <unistd.h>
#include <math.h>
#if OPEN_MPI && 0 #if 0 && OPEN_MPI
extern void ompi_datatype_dump( MPI_Datatype ddt ); extern void ompi_datatype_dump( MPI_Datatype ddt );
#define MPI_DDT_DUMP(ddt) ompi_datatype_dump( (ddt) ) #define MPI_DDT_DUMP(ddt) ompi_datatype_dump( (ddt) )
#else #else
@ -178,23 +179,145 @@ create_indexed_gap_optimized_ddt( void )
return dt3; return dt3;
} }
static void print_result( int length, int cycles, double time )
{
double bandwidth, clock_prec;
/********************************************************************
*******************************************************************/
#define DO_CONTIG 0x00000001
#define DO_CONSTANT_GAP 0x00000002
#define DO_INDEXED_GAP 0x00000004
#define DO_OPTIMIZED_INDEXED_GAP 0x00000008
#define DO_STRUCT_CONSTANT_GAP_RESIZED 0x00000010
#define DO_PACK 0x01000000
#define DO_UNPACK 0x02000000
#define DO_ISEND_RECV 0x04000000
#define DO_ISEND_IRECV 0x08000000
#define DO_IRECV_SEND 0x10000000
#define DO_IRECV_ISEND 0x20000000
#define MIN_LENGTH 1024
#define MAX_LENGTH (1024*1024)
static int cycles = 100;
static int trials = 20;
static int warmups = 2;
static void print_result( int length, int trials, double* timers )
{
double bandwidth, clock_prec, temp;
double min_time, max_time, average, std_dev = 0.0;
double ordered[trials];
int t, pos, quartile_start, quartile_end;
for( t = 0; t < trials; ordered[t] = timers[t], t++ );
for( t = 0; t < trials-1; t++ ) {
temp = ordered[t];
pos = t;
for( int i = t+1; i < trials; i++ ) {
if( temp > ordered[i] ) {
temp = ordered[i];
pos = i;
}
}
if( pos != t ) {
temp = ordered[t];
ordered[t] = ordered[pos];
ordered[pos] = temp;
}
}
quartile_start = trials - (3 * trials) / 4;
quartile_end = trials - (1 * trials) / 4;
clock_prec = MPI_Wtick(); clock_prec = MPI_Wtick();
bandwidth = (length * clock_prec * cycles) / (1024.0 * 1024.0) / (time * clock_prec); min_time = ordered[quartile_start];
printf( "%8d\t%.6f\t%.4f MB/s\n", length, time / cycles, bandwidth ); max_time = ordered[quartile_start];
average = ordered[quartile_start];
for( t = quartile_start + 1; t < quartile_end; t++ ) {
if( min_time > ordered[t] ) min_time = ordered[t];
if( max_time < ordered[t] ) max_time = ordered[t];
average += ordered[t];
}
average /= (quartile_end - quartile_start);
for( t = quartile_start; t < quartile_end; t++ ) {
std_dev += (ordered[t] - average) * (ordered[t] - average);
}
std_dev = sqrt( std_dev/(quartile_end - quartile_start) );
bandwidth = (length * clock_prec) / (1024.0 * 1024.0) / (average * clock_prec);
printf( "%8d\t%15g\t%10.4f MB/s [min %10g max %10g std %2.2f%%]\n", length, average, bandwidth,
min_time, max_time, (100.0 * std_dev) / average );
}
static int pack( int cycles,
MPI_Datatype sdt, int scount, void* sbuf,
void* packed_buf )
{
int position, myself, c, t, outsize;
double timers[trials];
MPI_Type_size( sdt, &outsize );
outsize *= scount;
MPI_Comm_rank( MPI_COMM_WORLD, &myself );
for( t = 0; t < warmups; t++ ) {
for( c = 0; c < cycles; c++ ) {
position = 0;
MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD);
}
}
for( t = 0; t < trials; t++ ) {
timers[t] = MPI_Wtime();
for( c = 0; c < cycles; c++ ) {
position = 0;
MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD);
}
timers[t] = (MPI_Wtime() - timers[t]) / cycles;
}
print_result( outsize, trials, timers );
return 0;
}
static int unpack( int cycles,
void* packed_buf,
MPI_Datatype rdt, int rcount, void* rbuf )
{
int position, myself, c, t, insize;
double timers[trials];
MPI_Type_size( rdt, &insize );
insize *= rcount;
MPI_Comm_rank( MPI_COMM_WORLD, &myself );
for( t = 0; t < warmups; t++ ) {
for( c = 0; c < cycles; c++ ) {
position = 0;
MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD);
}
}
for( t = 0; t < trials; t++ ) {
timers[t] = MPI_Wtime();
for( c = 0; c < cycles; c++ ) {
position = 0;
MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD);
}
timers[t] = (MPI_Wtime() - timers[t]) / cycles;
}
print_result( insize, trials, timers );
return 0;
} }
static int isend_recv( int cycles, static int isend_recv( int cycles,
MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype sdt, int scount, void* sbuf,
MPI_Datatype rdt, int rcount, void* rbuf ) MPI_Datatype rdt, int rcount, void* rbuf )
{ {
int myself, tag = 0, i, slength, rlength; int myself, tag = 0, c, t, slength, rlength;
MPI_Status status; MPI_Status status;
MPI_Request req; MPI_Request req;
double tstart, tend; double timers[trials];
MPI_Type_size( sdt, &slength ); MPI_Type_size( sdt, &slength );
slength *= scount; slength *= scount;
@ -203,21 +326,16 @@ static int isend_recv( int cycles,
MPI_Comm_rank( MPI_COMM_WORLD, &myself ); MPI_Comm_rank( MPI_COMM_WORLD, &myself );
tstart = MPI_Wtime(); for( t = 0; t < trials; t++ ) {
for( i = 0; i < cycles; i++ ) { timers[t] = MPI_Wtime();
#ifndef FAST for( c = 0; c < cycles; c++ ) {
MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req ); MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req );
MPI_Recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status ); MPI_Recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status );
MPI_Wait( &req, &status ); MPI_Wait( &req, &status );
/*MPI_Request_free( &req );*/
#else
ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req );
ftmpi_mpi_recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status );
ftmpi_request_free( &req );
#endif
} }
tend = MPI_Wtime(); timers[t] = (MPI_Wtime() - timers[t]) / cycles;
print_result( rlength, cycles, tend - tstart ); }
print_result( rlength, trials, timers );
return 0; return 0;
} }
@ -225,10 +343,10 @@ static int irecv_send( int cycles,
MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype sdt, int scount, void* sbuf,
MPI_Datatype rdt, int rcount, void* rbuf ) MPI_Datatype rdt, int rcount, void* rbuf )
{ {
int myself, tag = 0, i, slength, rlength; int myself, tag = 0, c, t, slength, rlength;
MPI_Request req; MPI_Request req;
MPI_Status status; MPI_Status status;
double tstart, tend; double timers[trials];
MPI_Type_size( sdt, &slength ); MPI_Type_size( sdt, &slength );
slength *= scount; slength *= scount;
@ -237,21 +355,16 @@ static int irecv_send( int cycles,
MPI_Comm_rank( MPI_COMM_WORLD, &myself ); MPI_Comm_rank( MPI_COMM_WORLD, &myself );
tstart = MPI_Wtime(); for( t = 0; t < trials; t++ ) {
for( i = 0; i < cycles; i++ ) { timers[t] = MPI_Wtime();
#ifndef FAST for( c = 0; c < cycles; c++ ) {
MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req ); MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req );
MPI_Send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD ); MPI_Send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD );
MPI_Wait( &req, &status ); MPI_Wait( &req, &status );
/*MPI_Request_free( &req );*/
#else
ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req );
ftmpi_mpi_send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD );
ftmpi_request_free( &req );
#endif
} }
tend = MPI_Wtime(); timers[t] = (MPI_Wtime() - timers[t]) / cycles;
print_result( rlength, cycles, tend - tstart ); }
print_result( rlength, trials, timers );
return 0; return 0;
} }
@ -259,10 +372,10 @@ static int isend_irecv_wait( int cycles,
MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype sdt, int scount, void* sbuf,
MPI_Datatype rdt, int rcount, void* rbuf ) MPI_Datatype rdt, int rcount, void* rbuf )
{ {
int myself, tag = 0, i, slength, rlength; int myself, tag = 0, c, t, slength, rlength;
MPI_Request sreq, rreq; MPI_Request requests[2];
MPI_Status status; MPI_Status statuses[2];
double tstart, tend; double timers[trials];
MPI_Type_size( sdt, &slength ); MPI_Type_size( sdt, &slength );
slength *= scount; slength *= scount;
@ -271,25 +384,16 @@ static int isend_irecv_wait( int cycles,
MPI_Comm_rank( MPI_COMM_WORLD, &myself ); MPI_Comm_rank( MPI_COMM_WORLD, &myself );
tstart = MPI_Wtime(); for( t = 0; t < trials; t++ ) {
for( i = 0; i < cycles; i++ ) { timers[t] = MPI_Wtime();
#ifndef FAST for( c = 0; c < cycles; c++ ) {
MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[0] );
MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[1] );
MPI_Wait( &sreq, &status ); MPI_Waitall( 2, requests, statuses );
MPI_Wait( &rreq, &status );
/*MPI_Request_free( &sreq );*/
/*MPI_Request_free( &rreq );*/
#else
ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq );
ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq );
ftmpi_wait( &sreq, &status );
ftmpi_request_free( &sreq );
ftmpi_request_free( &rreq );
#endif
} }
tend = MPI_Wtime(); timers[t] = (MPI_Wtime() - timers[t]) / cycles;
print_result( rlength, cycles, tend - tstart ); }
print_result( rlength, trials, timers );
return 0; return 0;
} }
@ -297,10 +401,10 @@ static int irecv_isend_wait( int cycles,
MPI_Datatype sdt, int scount, void* sbuf, MPI_Datatype sdt, int scount, void* sbuf,
MPI_Datatype rdt, int rcount, void* rbuf ) MPI_Datatype rdt, int rcount, void* rbuf )
{ {
int myself, tag = 0, i, slength, rlength; int myself, tag = 0, c, t, slength, rlength;
MPI_Request sreq, rreq; MPI_Request requests[2];
MPI_Status status; MPI_Status statuses[2];
double tstart, tend; double timers[trials];
MPI_Type_size( sdt, &slength ); MPI_Type_size( sdt, &slength );
slength *= scount; slength *= scount;
@ -309,73 +413,81 @@ static int irecv_isend_wait( int cycles,
MPI_Comm_rank( MPI_COMM_WORLD, &myself ); MPI_Comm_rank( MPI_COMM_WORLD, &myself );
tstart = MPI_Wtime(); for( t = 0; t < trials; t++ ) {
for( i = 0; i < cycles; i++ ) { timers[t] = MPI_Wtime();
#ifndef FAST for( c = 0; c < cycles; c++ ) {
MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq ); MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[0] );
MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq ); MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[1] );
MPI_Wait( &sreq, &status ); MPI_Waitall( 2, requests, statuses );
MPI_Wait( &rreq, &status );
/*MPI_Request_free( &sreq );*/
/*MPI_Request_free( &rreq );*/
#else
ftmpi_mpi_irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &rreq );
ftmpi_mpi_isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &sreq );
ftmpi_wait( &sreq, &status );
ftmpi_request_free( &sreq );
ftmpi_request_free( &rreq );
#endif
} }
tend = MPI_Wtime(); timers[t] = (MPI_Wtime() - timers[t]) / cycles;
print_result( rlength, cycles, tend - tstart ); }
print_result( rlength, trials, timers);
return 0; return 0;
} }
static int do_test_for_ddt( MPI_Datatype sddt, MPI_Datatype rddt, int length ) static int do_test_for_ddt( int doop, MPI_Datatype sddt, MPI_Datatype rddt, int length )
{ {
int i;
MPI_Aint lb, extent; MPI_Aint lb, extent;
char *sbuf, *rbuf; char *sbuf, *rbuf;
int i;
MPI_Type_get_extent( sddt, &lb, &extent ); MPI_Type_get_extent( sddt, &lb, &extent );
sbuf = (char*)malloc( length ); sbuf = (char*)malloc( length );
rbuf = (char*)malloc( length ); rbuf = (char*)malloc( length );
if( doop & DO_PACK ) {
printf("# Pack (max length %d)\n", length);
for( i = 1; i <= (length/extent); i *= 2 ) {
pack( cycles, sddt, i, sbuf, rbuf );
}
}
if( doop & DO_UNPACK ) {
printf("# Unpack (length %d)\n", length);
for( i = 1; i <= (length/extent); i *= 2 ) {
unpack( cycles, sbuf, rddt, i, rbuf );
}
}
if( doop & DO_ISEND_RECV ) {
printf( "# Isend recv (length %d)\n", length ); printf( "# Isend recv (length %d)\n", length );
for( i = 1; i <= (length/extent); i *= 2 ) { for( i = 1; i <= (length/extent); i *= 2 ) {
isend_recv( 10, sddt, i, sbuf, rddt, i, rbuf ); isend_recv( cycles, sddt, i, sbuf, rddt, i, rbuf );
} }
}
if( doop & DO_ISEND_IRECV ) {
printf( "# Isend Irecv Wait (length %d)\n", length ); printf( "# Isend Irecv Wait (length %d)\n", length );
for( i = 1; i <= (length/extent); i *= 2 ) { for( i = 1; i <= (length/extent); i *= 2 ) {
isend_irecv_wait( 10, sddt, i, sbuf, rddt, i, rbuf ); isend_irecv_wait( cycles, sddt, i, sbuf, rddt, i, rbuf );
} }
}
if( doop & DO_IRECV_SEND ) {
printf( "# Irecv send (length %d)\n", length ); printf( "# Irecv send (length %d)\n", length );
for( i = 1; i <= (length/extent); i *= 2 ) { for( i = 1; i <= (length/extent); i *= 2 ) {
irecv_send( 10, sddt, i, sbuf, rddt, i, rbuf ); irecv_send( cycles, sddt, i, sbuf, rddt, i, rbuf );
} }
}
if( doop & DO_IRECV_SEND ) {
printf( "# Irecv Isend Wait (length %d)\n", length ); printf( "# Irecv Isend Wait (length %d)\n", length );
for( i = 1; i <= (length/extent); i *= 2 ) { for( i = 1; i <= (length/extent); i *= 2 ) {
irecv_isend_wait( 10, sddt, i, sbuf, rddt, i, rbuf ); irecv_isend_wait( cycles, sddt, i, sbuf, rddt, i, rbuf );
}
} }
free( sbuf ); free( sbuf );
free( rbuf ); free( rbuf );
return 0; return 0;
} }
#define DO_CONTIG 0x01
#define DO_CONSTANT_GAP 0x02
#define DO_INDEXED_GAP 0x04
#define DO_OPTIMIZED_INDEXED_GAP 0x08
#define DO_STRUCT_CONSTANT_GAP_RESIZED 0x10
#define MIN_LENGTH 1024
#define MAX_LENGTH (1024*1024)
int main( int argc, char* argv[] ) int main( int argc, char* argv[] )
{ {
int run_tests = 0xffffffff; /* do all tests by default */ int run_tests = 0xffff; /* do all datatype tests by default */
int length, rank, size; int rank, size;
MPI_Datatype ddt; MPI_Datatype ddt;
/*int run_tests = DO_CONSTANT_GAP;*/
run_tests |= DO_PACK | DO_UNPACK;
MPI_Init (&argc, &argv); MPI_Init (&argc, &argv);
@ -389,16 +501,14 @@ int main( int argc, char* argv[] )
if( run_tests & DO_CONTIG ) { if( run_tests & DO_CONTIG ) {
printf( "\ncontiguous datatype\n\n" ); printf( "\ncontiguous datatype\n\n" );
for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) do_test_for_ddt( run_tests, MPI_INT, MPI_INT, MAX_LENGTH );
do_test_for_ddt( MPI_INT, MPI_INT, length );
} }
if( run_tests & DO_INDEXED_GAP ) { if( run_tests & DO_INDEXED_GAP ) {
printf( "\nindexed gap\n\n" ); printf( "\nindexed gap\n\n" );
ddt = create_indexed_gap_ddt(); ddt = create_indexed_gap_ddt();
MPI_DDT_DUMP( ddt ); MPI_DDT_DUMP( ddt );
for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
do_test_for_ddt( ddt, ddt, length );
MPI_Type_free( &ddt ); MPI_Type_free( &ddt );
} }
@ -406,8 +516,7 @@ int main( int argc, char* argv[] )
printf( "\noptimized indexed gap\n\n" ); printf( "\noptimized indexed gap\n\n" );
ddt = create_indexed_gap_optimized_ddt(); ddt = create_indexed_gap_optimized_ddt();
MPI_DDT_DUMP( ddt ); MPI_DDT_DUMP( ddt );
for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
do_test_for_ddt( ddt, ddt, length );
MPI_Type_free( &ddt ); MPI_Type_free( &ddt );
} }
@ -415,8 +524,7 @@ int main( int argc, char* argv[] )
printf( "\nconstant indexed gap\n\n" ); printf( "\nconstant indexed gap\n\n" );
ddt = create_indexed_constant_gap_ddt( 80, 100, 1 ); ddt = create_indexed_constant_gap_ddt( 80, 100, 1 );
MPI_DDT_DUMP( ddt ); MPI_DDT_DUMP( ddt );
for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
do_test_for_ddt( ddt, ddt, length );
MPI_Type_free( &ddt ); MPI_Type_free( &ddt );
} }
@ -424,8 +532,7 @@ int main( int argc, char* argv[] )
printf( "\noptimized constant indexed gap\n\n" ); printf( "\noptimized constant indexed gap\n\n" );
ddt = create_optimized_indexed_constant_gap_ddt( 80, 100, 1 ); ddt = create_optimized_indexed_constant_gap_ddt( 80, 100, 1 );
MPI_DDT_DUMP( ddt ); MPI_DDT_DUMP( ddt );
for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
do_test_for_ddt( ddt, ddt, length );
MPI_Type_free( &ddt ); MPI_Type_free( &ddt );
} }
@ -433,8 +540,7 @@ int main( int argc, char* argv[] )
printf( "\nstruct constant gap resized\n\n" ); printf( "\nstruct constant gap resized\n\n" );
ddt = create_struct_constant_gap_resized_ddt( 0 /* unused */, 0 /* unused */, 0 /* unused */ ); ddt = create_struct_constant_gap_resized_ddt( 0 /* unused */, 0 /* unused */, 0 /* unused */ );
MPI_DDT_DUMP( ddt ); MPI_DDT_DUMP( ddt );
for( length = MIN_LENGTH; length < MAX_LENGTH; length <<=1 ) do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
do_test_for_ddt( ddt, ddt, length );
MPI_Type_free( &ddt ); MPI_Type_free( &ddt );
} }

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */ /* -*- Mode: C; c-basic-offset:4 ; -*- */
/* /*
* Copyright (c) 2014 The University of Tennessee and The University * Copyright (c) 2014-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2014 Research Organization for Information Science * Copyright (c) 2014 Research Organization for Information Science
@ -18,7 +18,6 @@
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_internal.h" #include "opal/datatype/opal_datatype_internal.h"
// #include <mpi.h>
#include <time.h> #include <time.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
@ -61,6 +60,18 @@ static void print_bar_pbar(struct foo_t* bar, struct pfoo_t* pbar)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
static void print_stack(opal_convertor_t* conv)
{
printf("Stack pos %d [converted %" PRIsize_t "/%" PRIsize_t "]\n",
conv->stack_pos, conv->bConverted, conv->local_size);
for( uint32_t i = 0; i <= conv->stack_pos; i++ ) {
printf( "[%u] index %d, type %s count %" PRIsize_t " disp %p\n",
i, conv->pStack[i].index, opal_datatype_basicDatatypes[conv->pStack[i].type]->name,
conv->pStack[i].count, (void*)conv->pStack[i].disp);
}
printf("\n");
}
static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) { static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) {
int i, j, errors = 0; int i, j, errors = 0;
struct iovec a; struct iovec a;
@ -104,6 +115,7 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) {
max_data = a.iov_len; max_data = a.iov_len;
pos = arr[i][1]; pos = arr[i][1];
opal_convertor_set_position(pConv, &pos); opal_convertor_set_position(pConv, &pos);
print_stack(pConv);
assert(arr[i][1] == pos); assert(arr[i][1] == pos);
opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); opal_convertor_unpack( pConv, &a, &iov_count, &max_data );
a.iov_base = (char*)a.iov_base - 1024; a.iov_base = (char*)a.iov_base - 1024;
@ -118,9 +130,10 @@ static int testcase(ompi_datatype_t * newtype, size_t arr[10][2]) {
bar[j].d[1] != 0.0 || bar[j].d[1] != 0.0 ||
bar[j].d[2] != pbar[j].d[1]) { bar[j].d[2] != pbar[j].d[1]) {
if(0 == errors) { if(0 == errors) {
fprintf(stderr, "ERROR ! count=%d, position=%d, ptr = %p" (void)opal_datatype_dump(&newtype->super);
fprintf(stderr, "ERROR ! position=%d/%d, ptr = %p"
" got (%d,%d,%d,%g,%g,%g) expected (%d,%d,%d,%g,%g,%g)\n", " got (%d,%d,%d,%g,%g,%g) expected (%d,%d,%d,%g,%g,%g)\n",
N, j, (void*)&bar[j], j, N, (void*)&bar[j],
bar[j].i[0], bar[j].i[0],
bar[j].i[1], bar[j].i[1],
bar[j].i[2], bar[j].i[2],