From 36ce319869676f912f9979ccc014511d8d27f3d2 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 12 Nov 2010 23:22:35 +0000 Subject: [PATCH] Add a second version of the datatype copy function using memmove instead of memcpy. As memmove is slower than memcpy, I added the required logic to only use it when really necessary. No modification from developers point of view, you should always call opal_datatype_copy_content_same_ddt. This commit was SVN r24047. --- opal/datatype/Makefile.am | 3 +- opal/datatype/opal_datatype_copy.c | 251 +++++---------------------- opal/datatype/opal_datatype_copy.h | 248 ++++++++++++++++++++++++++ opal/datatype/opal_datatype_memcpy.h | 28 --- 4 files changed, 295 insertions(+), 235 deletions(-) create mode 100644 opal/datatype/opal_datatype_copy.h diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 6a4cef3f9e..bb39724929 100644 --- a/opal/datatype/Makefile.am +++ b/opal/datatype/Makefile.am @@ -3,7 +3,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2009 The University of Tennessee and The University +# Copyright (c) 2004-2010 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -27,6 +27,7 @@ headers = \ opal_datatype_checksum.h \ opal_datatype.h \ opal_datatype_internal.h \ + opal_datatype_copy.h \ opal_datatype_memcpy.h \ opal_datatype_pack.h \ opal_datatype_prototypes.h \ diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index f3d2697e67..888ae5adda 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2010 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -41,232 +41,71 @@ extern int opal_copy_debug; #define DO_DEBUG(INST) #endif /* OPAL_ENABLE_DEBUG */ -size_t opal_datatype_memcpy_block_size = 128 * 1024; +static size_t opal_datatype_memop_block_size = 128 * 1024; -static inline void copy_predefined_data( const dt_elem_desc_t* ELEM, - const opal_datatype_t* DATATYPE, - unsigned char* SOURCE_BASE, - int32_t TOTAL_COUNT, - uint32_t COUNT, - unsigned char* SOURCE, - unsigned char* DESTINATION, - size_t* SPACE ) -{ - uint32_t _copy_count = (COUNT); - size_t _copy_blength; - const ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _source = (SOURCE) + _elem->disp; - unsigned char* _destination = (DESTINATION) + _elem->disp; +/** + * Non overlapping memory regions + */ +#undef MEM_OP_NAME +#define MEM_OP_NAME non_overlap +#undef MEM_OP +#define MEM_OP MEMCPY +#include "opal_datatype_copy.h" - _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; +#define MEMMOVE(d, s, l) \ + do { \ + if( (((d) < (s)) && (((d) + (l)) > (s))) || \ + (((s) < (d)) && (((s) + (l)) > (s))) ) { \ + memmove( (d), (s), (l) ); \ + } else { \ + MEMCPY( (d), (s), (l) ); \ + } \ + } while (0) - if( _copy_blength == (uint32_t)_elem->extent ) { - _copy_blength *= _copy_count; - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - /* the extent and the size of the basic datatype are equals */ - DO_DEBUG( opal_output( 0, "copy 1. memcpy( %p, %p, %lu ) => space %lu\n", - _destination, _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - MEMCPY( _destination, _source, _copy_blength ); - _source += _copy_blength; - _destination += _copy_blength; - } else { - uint32_t _i; - for( _i = 0; _i < _copy_count; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - DO_DEBUG( opal_output( 0, "copy 2. memcpy( %p, %p, %lu ) => space %lu\n", - _destination, _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); - MEMCPY( _destination, _source, _copy_blength ); - _source += _elem->extent; - _destination += _elem->extent; - } - _copy_blength *= _copy_count; - } - *(SPACE) -= _copy_blength; -} - -static inline void copy_contiguous_loop( const dt_elem_desc_t* ELEM, - const opal_datatype_t* DATATYPE, - unsigned char* SOURCE_BASE, - int32_t TOTAL_COUNT, - uint32_t COUNT, - unsigned char* SOURCE, - unsigned char* DESTINATION, - size_t* SPACE ) -{ - ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); - ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _source = (SOURCE) + _end_loop->first_elem_disp; - unsigned char* _destination = (DESTINATION) + _end_loop->first_elem_disp; - size_t _copy_loops = (COUNT); - uint32_t _i; - - if( _loop->extent == (OPAL_PTRDIFF_TYPE)_end_loop->size ) { /* the loop is contiguous */ - _copy_loops *= _end_loop->size; - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_loops, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - MEMCPY( _destination, _source, _copy_loops ); - } else { - for( _i = 0; _i < _copy_loops; _i++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _end_loop->size, (SOURCE_BASE), - (DATATYPE), (TOTAL_COUNT) ); - DO_DEBUG( opal_output( 0, "copy 3. memcpy( %p, %p, %lu ) => space %lu\n", - _destination, _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); - MEMCPY( _destination, _source, _end_loop->size ); - _source += _loop->extent; - _destination += _loop->extent; - } - _copy_loops *= _end_loop->size; - } - *(SPACE) -= _copy_loops; -} - -#define COPY_PREDEFINED_DATATYPE( ELEM, DATATYPE, SOURCE_BASE, TOTAL_COUNT, COUNT, SOURCE, DESTINATION, SPACE ) \ - copy_predefined_data( (ELEM), (DATATYPE), (SOURCE_BASE), (TOTAL_COUNT), \ - (COUNT), (SOURCE), (DESTINATION), &(SPACE) ) - -#define COPY_CONTIGUOUS_LOOP( ELEM, DATATYPE, SOURCE_BASE, TOTAL_COUNT, COUNT, SOURCE, DESTINATION, SPACE ) \ - copy_contiguous_loop( (ELEM), (DATATYPE), (SOURCE_BASE), (TOTAL_COUNT), \ - (COUNT), (SOURCE), (DESTINATION), &(SPACE) ) +/** + * Overlapping memory regions + */ +#undef MEM_OP_NAME +#define MEM_OP_NAME overlap +#undef MEM_OP +#define MEM_OP MEMMOVE +#include "opal_datatype_copy.h" int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count, char* destination_base, char* source_base ) { - dt_stack_t* pStack; /* pointer to the position on the stack */ - int32_t stack_pos; /* index of the stack level */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - uint32_t count_desc; /* the number of items already done in the actual pos_desc */ - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; + OPAL_PTRDIFF_TYPE extent; size_t iov_len_local; - unsigned char *source = (unsigned char*)source_base, - *destination = (unsigned char*)destination_base; + int32_t (*fct)( const opal_datatype_t*, int32_t, char*, char*); DO_DEBUG( opal_output( 0, "opal_datatype_copy_content_same_ddt( %p, %d, dst %p, src %p )\n", (void*)datatype, count, destination_base, source_base ); ); + /* empty data ? then do nothing. This should normally be trapped * at a higher level. */ if( 0 == count ) return 1; iov_len_local = count * datatype->size; - - /* If we have to copy a contiguous datatype then simply - * do a memcpy. + /** + * see discussion in coll_basic_reduce.c for the computation of extent when + * count != 1. Short version of the story: + * (true_extent + ((count - 1) * extent)) */ - if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - OPAL_PTRDIFF_TYPE extent = (datatype->ub - datatype->lb); - /* Now that we know the datatype is contiguous, we should move the 2 pointers - * source and destination to the correct displacement. - */ - destination += datatype->true_lb; - source += datatype->true_lb; - if( (OPAL_PTRDIFF_TYPE)datatype->size == extent ) { /* all contiguous == no gaps around */ - size_t total_length = iov_len_local; - size_t memcpy_chunk = opal_datatype_memcpy_block_size; - while( total_length > 0 ) { - if( memcpy_chunk > total_length ) memcpy_chunk = total_length; - OPAL_DATATYPE_SAFEGUARD_POINTER( destination, memcpy_chunk, - (unsigned char*)destination_base, datatype, count ); - OPAL_DATATYPE_SAFEGUARD_POINTER( source, memcpy_chunk, - (unsigned char*)source_base, datatype, count ); - DO_DEBUG( opal_output( 0, "copy c1. memcpy( %p, %p, %lu ) => space %lu\n", - destination, source, (unsigned long)memcpy_chunk, (unsigned long)total_length ); ); - MEMCPY( destination, source, memcpy_chunk ); - destination += memcpy_chunk; - source += memcpy_chunk; - total_length -= memcpy_chunk; - } - return 0; /* completed */ - } - for( pos_desc = 0; (int32_t)pos_desc < count; pos_desc++ ) { - OPAL_DATATYPE_SAFEGUARD_POINTER( destination, datatype->size, - (unsigned char*)destination_base, datatype, count ); - OPAL_DATATYPE_SAFEGUARD_POINTER( source, datatype->size, - (unsigned char*)source_base, datatype, count ); - DO_DEBUG( opal_output( 0, "copy c2. memcpy( %p, %p, %lu ) => space %lu\n", - destination, source, (unsigned long)datatype->size, - (unsigned long)(iov_len_local - (pos_desc * datatype->size)) ); ); - MEMCPY( destination, source, datatype->size ); - destination += extent; - source += extent; - } - return 0; /* completed */ - } + extent = (datatype->true_ub - datatype->true_lb) + (count - 1) * (datatype->ub - datatype->lb); - pStack = (dt_stack_t*)alloca( sizeof(dt_stack_t) * (datatype->btypes[OPAL_DATATYPE_LOOP] + 1) ); - pStack->count = count; - pStack->index = -1; - pStack->disp = 0; - pos_desc = 0; - stack_pos = 0; - - if( datatype->opt_desc.desc != NULL ) { - description = datatype->opt_desc.desc; + fct = non_overlap_copy_content_same_ddt; + if( destination_base < source_base ) { + if( (destination_base + extent) > source_base ) { + /* memmove */ + fct = overlap_copy_content_same_ddt; + } } else { - description = datatype->desc.desc; - } - - if( description[0].elem.common.type == OPAL_DATATYPE_LOOP ) - count_desc = description[0].loop.loops; - else - count_desc = description[0].elem.count; - pElem = &(description[pos_desc]); - - while( 1 ) { - while( OPAL_LIKELY(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) ) { - /* now here we have a basic datatype */ - COPY_PREDEFINED_DATATYPE( pElem, datatype, (unsigned char*)source_base, count, count_desc, - source, destination, iov_len_local ); - pos_desc++; /* advance to the next data */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DO_DEBUG( opal_output( 0, "copy end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", - (int)pStack->count, stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - if( --(pStack->count) == 0 ) { /* end of loop */ - if( stack_pos == 0 ) { - assert( iov_len_local == 0 ); - return 0; /* completed */ - } - stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if( pStack->index == -1 ) { - pStack->disp += (datatype->ub - datatype->lb); - } else { - assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; - } - } - source = (unsigned char*)source_base + pStack->disp; - destination = (unsigned char*)destination_base + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DO_DEBUG( opal_output( 0, "copy new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", - (int)pStack->count, stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - } - if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)source; - if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - COPY_CONTIGUOUS_LOOP( pElem, datatype, (unsigned char*)source_base, count, count_desc, - source, destination, iov_len_local ); - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - local_disp = (OPAL_PTRDIFF_TYPE)source - local_disp; - PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - pos_desc++; - update_loop_description: /* update the current state */ - source = (unsigned char*)source_base + pStack->disp; - destination = (unsigned char*)destination_base + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DDT_DUMP_STACK( pStack, stack_pos, pElem, "advance loop" ); - continue; + if( (source_base + extent) > destination_base ) { + /* memmove */ + fct = overlap_copy_content_same_ddt; } } + return fct( datatype, count, destination_base, source_base ); } diff --git a/opal/datatype/opal_datatype_copy.h b/opal/datatype/opal_datatype_copy.h new file mode 100644 index 0000000000..86ed92def0 --- /dev/null +++ b/opal/datatype/opal_datatype_copy.h @@ -0,0 +1,248 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MEM_OP_NAME) +#error +#endif /* !defined((MEM_OP_NAME) */ +#if !defined(MEM_OP) +#error +#endif /* !defined(MEM_OP) */ + +#ifndef STRINGIFY +# define STRINGIFY_(arg) #arg +# define STRINGIFY(arg) STRINGIFY_(arg) +#endif + +#ifndef DT_CONCAT +# define DT__CONCAT(a, b) a##b +# define DT_CONCAT(a, b) DT__CONCAT(a, b) +#endif + + +#define _predefined_data DT_CONCAT(MEM_OP_NAME,_predefined_data) +#define _contiguous_loop DT_CONCAT(MEM_OP_NAME,_contiguous_loop) +#define _copy_content_same_ddt DT_CONCAT(MEM_OP_NAME,_copy_content_same_ddt) + +static inline void _predefined_data( const dt_elem_desc_t* ELEM, + const opal_datatype_t* DATATYPE, + unsigned char* SOURCE_BASE, + int32_t TOTAL_COUNT, + uint32_t COUNT, + unsigned char* SOURCE, + unsigned char* DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = (COUNT); + size_t _copy_blength; + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _source = (SOURCE) + _elem->disp; + unsigned char* _destination = (DESTINATION) + _elem->disp; + + _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size; + + if( _copy_blength == (uint32_t)_elem->extent ) { + _copy_blength *= _copy_count; + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + /* the extent and the size of the basic datatype are equals */ + DO_DEBUG( opal_output( 0, "copy 1. %s( %p, %p, %lu ) => space %lu\n", + STRINGIFY(MEM_OP_NAME), _destination, _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); + MEM_OP( _destination, _source, _copy_blength ); + _source += _copy_blength; + _destination += _copy_blength; + } else { + uint32_t _i; + for( _i = 0; _i < _copy_count; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy 2. %s( %p, %p, %lu ) => space %lu\n", + STRINGIFY(MEM_OP_NAME), _destination, _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); ); + MEM_OP( _destination, _source, _copy_blength ); + _source += _elem->extent; + _destination += _elem->extent; + } + _copy_blength *= _copy_count; + } + *(SPACE) -= _copy_blength; +} + +static inline void _contiguous_loop( const dt_elem_desc_t* ELEM, + const opal_datatype_t* DATATYPE, + unsigned char* SOURCE_BASE, + int32_t TOTAL_COUNT, + uint32_t COUNT, + unsigned char* SOURCE, + unsigned char* DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (SOURCE) + _end_loop->first_elem_disp; + unsigned char* _destination = (DESTINATION) + _end_loop->first_elem_disp; + size_t _copy_loops = (COUNT); + uint32_t _i; + + if( _loop->extent == (OPAL_PTRDIFF_TYPE)_end_loop->size ) { /* the loop is contiguous */ + _copy_loops *= _end_loop->size; + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _copy_loops, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + MEM_OP( _destination, _source, _copy_loops ); + } else { + for( _i = 0; _i < _copy_loops; _i++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _end_loop->size, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy 3. %s( %p, %p, %lu ) => space %lu\n", + STRINGIFY(MEM_OP_NAME), _destination, _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); ); + MEM_OP( _destination, _source, _end_loop->size ); + _source += _loop->extent; + _destination += _loop->extent; + } + _copy_loops *= _end_loop->size; + } + *(SPACE) -= _copy_loops; +} + +static inline int32_t _copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count, + char* destination_base, char* source_base ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + int32_t stack_pos; /* index of the stack level */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + size_t iov_len_local; + unsigned char *source = (unsigned char*)source_base, + *destination = (unsigned char*)destination_base; + + DO_DEBUG( opal_output( 0, "_copy_content_same_ddt( %p, %d, dst %p, src %p )\n", + (void*)datatype, count, destination_base, source_base ); ); + + iov_len_local = count * datatype->size; + + /* If we have to copy a contiguous datatype then simply + * do a MEM_OP. + */ + if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + OPAL_PTRDIFF_TYPE extent = (datatype->ub - datatype->lb); + /* Now that we know the datatype is contiguous, we should move the 2 pointers + * source and destination to the correct displacement. + */ + destination += datatype->true_lb; + source += datatype->true_lb; + if( (OPAL_PTRDIFF_TYPE)datatype->size == extent ) { /* all contiguous == no gaps around */ + size_t total_length = iov_len_local; + size_t memop_chunk = opal_datatype_memop_block_size; + while( total_length > 0 ) { + if( memop_chunk > total_length ) memop_chunk = total_length; + OPAL_DATATYPE_SAFEGUARD_POINTER( destination, memop_chunk, + (unsigned char*)destination_base, datatype, count ); + OPAL_DATATYPE_SAFEGUARD_POINTER( source, memop_chunk, + (unsigned char*)source_base, datatype, count ); + DO_DEBUG( opal_output( 0, "copy c1. %s( %p, %p, %lu ) => space %lu\n", + STRINGIFY(MEM_OP_NAME), destination, source, (unsigned long)memop_chunk, (unsigned long)total_length ); ); + MEM_OP( destination, source, memop_chunk ); + destination += memop_chunk; + source += memop_chunk; + total_length -= memop_chunk; + } + return 0; /* completed */ + } + for( pos_desc = 0; (int32_t)pos_desc < count; pos_desc++ ) { + OPAL_DATATYPE_SAFEGUARD_POINTER( destination, datatype->size, + (unsigned char*)destination_base, datatype, count ); + OPAL_DATATYPE_SAFEGUARD_POINTER( source, datatype->size, + (unsigned char*)source_base, datatype, count ); + DO_DEBUG( opal_output( 0, "copy c2. %s( %p, %p, %lu ) => space %lu\n", + STRINGIFY(MEM_OP_NAME), destination, source, (unsigned long)datatype->size, + (unsigned long)(iov_len_local - (pos_desc * datatype->size)) ); ); + MEM_OP( destination, source, datatype->size ); + destination += extent; + source += extent; + } + return 0; /* completed */ + } + + pStack = (dt_stack_t*)alloca( sizeof(dt_stack_t) * (datatype->btypes[OPAL_DATATYPE_LOOP] + 1) ); + pStack->count = count; + pStack->index = -1; + pStack->disp = 0; + pos_desc = 0; + stack_pos = 0; + + if( datatype->opt_desc.desc != NULL ) { + description = datatype->opt_desc.desc; + } else { + description = datatype->desc.desc; + } + + if( description[0].elem.common.type == OPAL_DATATYPE_LOOP ) + count_desc = description[0].loop.loops; + else + count_desc = description[0].elem.count; + pElem = &(description[pos_desc]); + + while( 1 ) { + while( OPAL_LIKELY(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) ) { + /* now here we have a basic datatype */ + _predefined_data( pElem, datatype, (unsigned char*)source_base, count, count_desc, + source, destination, &iov_len_local ); + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DO_DEBUG( opal_output( 0, "copy end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( stack_pos == 0 ) { + assert( iov_len_local == 0 ); + return 0; /* completed */ + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (datatype->ub - datatype->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + source = (unsigned char*)source_base + pStack->disp; + destination = (unsigned char*)destination_base + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DO_DEBUG( opal_output( 0, "copy new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)source; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + _contiguous_loop( pElem, datatype, (unsigned char*)source_base, count, count_desc, + source, destination, &iov_len_local ); + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + local_disp = (OPAL_PTRDIFF_TYPE)source - local_disp; + PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + source = (unsigned char*)source_base + pStack->disp; + destination = (unsigned char*)destination_base + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DDT_DUMP_STACK( pStack, stack_pos, pElem, "advance loop" ); + continue; + } + } +} diff --git a/opal/datatype/opal_datatype_memcpy.h b/opal/datatype/opal_datatype_memcpy.h index c6a5af87a3..972009ac96 100644 --- a/opal/datatype/opal_datatype_memcpy.h +++ b/opal/datatype/opal_datatype_memcpy.h @@ -14,35 +14,7 @@ #ifndef OPAL_DATATYPE_MEMCPY_H_HAS_BEEN_INCLUDED #define OPAL_DATATYPE_MEMCPY_H_HAS_BEEN_INCLUDED -extern void* mmx_memcpy( void* dst, const void* src, size_t n ); -extern void* mmx2_memcpy( void* dst, const void* src, size_t n ); -extern void* sse_memcpy( void* dst, const void* src, size_t n ); - -/* for small memory blocks (<256 bytes) this version is faster */ -#define small_memcpy(to,from,n) \ -{ \ - register unsigned long int dummy; \ - void *_dst = (to); \ - const void *_src = (from); \ - __asm__ __volatile__( "rep; movsb" \ - :"=&D"(_dst), "=&S"(_src), "=&c"(dummy) \ - :"0" (_dst), "1" (_src),"2" (n) \ - : "memory"); \ -} - #define MEMCPY( DST, SRC, BLENGTH ) \ memcpy( (DST), (SRC), (BLENGTH) ) -#if 0 -#define MEMCPY( DST, SRC, BLENGTH ) \ -do { \ - if( 128 > (BLENGTH) ) { \ - small_memcpy( (DST), (SRC), (BLENGTH) ); \ - } else if( (64*1024-100) > (BLENGTH) ) { \ - mmx_memcpy( (DST), (SRC), (BLENGTH) ); \ - } else {\ - mmx2_memcpy( (DST), (SRC), (BLENGTH) ); \ - } \ -} while (0) -#endif #endif /* OPAL_DATATYPE_MEMCPY_H_HAS_BEEN_INCLUDED */