From 08a29da9a95dea4a2272c21c1baf2b91bb05ca9e Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 13 Nov 2005 22:57:28 +0000 Subject: [PATCH] New version of the send data to self for the case where both the send and the receive data are the same. This commit was SVN r8141. --- ompi/datatype/dt_copy.c | 268 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 ompi/datatype/dt_copy.c diff --git a/ompi/datatype/dt_copy.c b/ompi/datatype/dt_copy.c new file mode 100644 index 0000000000..964f90ac08 --- /dev/null +++ b/ompi/datatype/dt_copy.c @@ -0,0 +1,268 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "datatype/datatype.h" +#include "datatype/convertor.h" +#include "datatype/datatype_internal.h" + +#ifdef HAVE_ALLOCA_H +#include +#endif +#include + +#if OMPI_ENABLE_DEBUG +int32_t ompi_copy_debug = 0; +#define DO_DEBUG(INST) if( ompi_copy_debug ) { INST } +#else +#define DO_DEBUG(INST) +#endif /* OMPI_ENABLE_DEBUG */ + +size_t ompi_datatype_memcpy_block_size = 128 * 1024; + +static inline void copy_predefined_data( const dt_elem_desc_t* ELEM, + const ompi_datatype_t* DATATYPE, + char* SOURCE_BASE, + int32_t TOTAL_COUNT, + uint32_t COUNT, + char* SOURCE, + char* DESTINATION, + uint32_t* SPACE ) +{ + uint32_t _copy_count = (COUNT), _copy_blength; + const ddt_elem_desc_t* _elem = &((ELEM)->elem); + char* _source = (SOURCE) + _elem->disp; + char* _destination = (DESTINATION) + _elem->disp; + + _copy_blength = ompi_ddt_basicDatatypes[_elem->common.type]->size; + + if( _copy_blength == (uint32_t)_elem->extent ) { + _copy_blength *= _copy_count; + OMPI_DDT_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + /* the extent and the size of the basic datatype are equals */ + DO_DEBUG( opal_output( 0, "copy 1. memcpy( %p, %p, %ld ) => space %d\n", + _destination, _source, _copy_blength, *(SPACE) ); ); + MEMCPY( _destination, _source, _copy_blength ); + _source += _copy_blength; + _destination += _copy_blength; + } else { + uint32_t _i; + for( _i = 0; _i < _copy_count; _i++ ) { + OMPI_DDT_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy 2. memcpy( %p, %p, %ld ) => space %d\n", + _destination, _source, _copy_blength, *(SPACE) - (_i * _copy_blength) ); ); + MEMCPY( _destination, _source, _copy_blength ); + _source += _elem->extent; + _destination += _elem->extent; + } + _copy_blength *= _copy_count; + } + *(SPACE) -= _copy_blength; +} + +static inline void copy_contiguous_loop( const dt_elem_desc_t* ELEM, + const ompi_datatype_t* DATATYPE, + char* SOURCE_BASE, + int32_t TOTAL_COUNT, + uint32_t COUNT, + char* SOURCE, + char* DESTINATION, + uint32_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + char* _source = (SOURCE) + _end_loop->first_elem_disp; + char* _destination = (DESTINATION) + _end_loop->first_elem_disp; + size_t _copy_loops = (COUNT); + uint32_t _i; + + if( _loop->extent == (long)_end_loop->size ) { /* the loop is contiguous */ + _copy_loops *= _end_loop->size; + OMPI_DDT_SAFEGUARD_POINTER( _source, _copy_loops, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + MEMCPY( _destination, _source, _copy_loops ); + } else { + for( _i = 0; _i < _copy_loops; _i++ ) { + OMPI_DDT_SAFEGUARD_POINTER( _source, _end_loop->size, (SOURCE_BASE), + (DATATYPE), (TOTAL_COUNT) ); + DO_DEBUG( opal_output( 0, "copy 3. memcpy( %p, %p, %ld ) => space %ld\n", + _destination, _source, _end_loop->size, *(SPACE) - _i * _end_loop->size ); ); + MEMCPY( _destination, _source, _end_loop->size ); + _source += _loop->extent; + _destination += _loop->extent; + } + _copy_loops *= _end_loop->size; + } + *(SPACE) -= _copy_loops; +} + +#define COPY_PREDEFINED_DATATYPE( ELEM, DATATYPE, SOURCE_BASE, TOTAL_COUNT, COUNT, SOURCE, DESTINATION, SPACE ) \ + copy_predefined_data( (ELEM), (DATATYPE), (SOURCE_BASE), (TOTAL_COUNT), \ + (COUNT), (SOURCE), (DESTINATION), &(SPACE) ) + +#define COPY_CONTIGUOUS_LOOP( ELEM, DATATYPE, SOURCE_BASE, TOTAL_COUNT, COUNT, SOURCE, DESTINATION, SPACE ) \ + copy_contiguous_loop( (ELEM), (DATATYPE), (SOURCE_BASE), (TOTAL_COUNT), \ + (COUNT), (SOURCE), (DESTINATION), &(SPACE) ) + +#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ + do { \ + (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + (COUNTER) = (ELEMENT)->elem.count; \ + } while (0) + +int32_t ompi_ddt_copy_content_same_ddt( const ompi_datatype_t* datatype, int32_t count, + char* destination_base, char* source_base ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + int32_t stack_pos; /* index of the stack level */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + uint32_t iov_len_local; + char *source = source_base, *destination = destination_base; + + DO_DEBUG( opal_output( 0, "ompi_ddt_copy_content_same_ddt( %p, %d, dst %p, src %p )\n", + datatype, count, destination_base, source_base ); ); + /* empty data ? then do nothing. This should normally be trapped + * at a higher level. + */ + if( 0 == count ) return 1; + + iov_len_local = count * datatype->size; + + /* If we have to copy a contiguous datatype then simply + * do a memcpy. + */ + if( (datatype->flags & DT_FLAG_CONTIGUOUS) == DT_FLAG_CONTIGUOUS ) { + long extent = (datatype->ub - datatype->lb); + /* Now that we know the datatype is contiguous, we should move the 2 pointers + * source and destination to the correct displacement. + */ + destination += datatype->lb; + source += datatype->lb; + if( (long)datatype->size == extent ) { /* all contiguous == no gaps around */ + size_t total_length = datatype->size * count; + size_t memcpy_chunk = ompi_datatype_memcpy_block_size; + while( total_length > 0 ) { + if( memcpy_chunk > total_length ) memcpy_chunk = total_length; + OMPI_DDT_SAFEGUARD_POINTER( destination, memcpy_chunk, + destination, datatype, count ); + DO_DEBUG( opal_output( 0, "copy c1. memcpy( %p, %p, %ld ) => space %d\n", + destination, source, memcpy_chunk, iov_len_local ); ); + MEMCPY( destination, source, memcpy_chunk ); + destination += memcpy_chunk; + source += memcpy_chunk; + total_length -= memcpy_chunk; + iov_len_local -= memcpy_chunk; + } + } else { + for( pos_desc = 0; (int32_t)pos_desc < count; pos_desc++ ) { + OMPI_DDT_SAFEGUARD_POINTER( destination, datatype->size, + destination, datatype, count ); + DO_DEBUG( opal_output( 0, "copy c2. memcpy( %p, %p, %ld ) => space %d\n", + destination, source, datatype->size, + iov_len_local - (pos_desc * datatype->size) ); ); + MEMCPY( destination, source, datatype->size ); + destination += extent; + source += extent; + } + } + return 0; + } + + pStack = alloca( sizeof(dt_stack_t) * (datatype->btypes[DT_LOOP] + 1) ); + pStack->count = count; + pStack->index = -1; + pStack->disp = 0; + pos_desc = 0; + stack_pos = 0; + + if( datatype->opt_desc.desc != NULL ) { + description = datatype->opt_desc.desc; + pStack->end_loop = datatype->opt_desc.used; + } else { + description = datatype->desc.desc; + pStack->end_loop = datatype->desc.used; + } + + if( description[0].elem.common.type == DT_LOOP ) + count_desc = description[0].loop.loops; + else + count_desc = description[0].elem.count; + pElem = &(description[pos_desc]); + + while( 1 ) { + if( DT_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DO_DEBUG( opal_output( 0, "copy end_loop count %d stack_pos %d pos_desc %d disp %ld space %d\n", + pStack->count, stack_pos, pos_desc, pStack->disp, iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( stack_pos == 0 ) { + assert( iov_len_local == 0 ); + return 0; /* completed */ + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (datatype->ub - datatype->lb); + } else { + assert( DT_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + source = source_base + pStack->disp; + destination = destination_base + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DO_DEBUG( opal_output( 0, "copy new_loop count %d stack_pos %d pos_desc %d disp %ld space %d\n", + pStack->count, stack_pos, pos_desc, pStack->disp, iov_len_local ); ); + } + if( DT_LOOP == pElem->elem.common.type ) { + long local_disp = (long)source; + if( pElem->loop.common.flags & DT_FLAG_CONTIGUOUS ) { + COPY_CONTIGUOUS_LOOP( pElem, datatype, source_base, count, count_desc, + source, destination, iov_len_local ); + assert( 0 == count_desc ); + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + local_disp = (long)source - local_disp; + PUSH_STACK( pStack, stack_pos, pos_desc, DT_LOOP, count_desc, + pStack->disp + local_disp, pos_desc + pElem->elem.disp + 1); + pos_desc++; + update_loop_description: /* update the current state */ + source = source_base + pStack->disp; + destination = destination_base + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DDT_DUMP_STACK( pStack, stack_pos, pElem, "advance loop" ); + continue; + } + while( pElem->elem.common.flags & DT_FLAG_DATA ) { + /* now here we have a basic datatype */ + COPY_PREDEFINED_DATATYPE( pElem, datatype, source_base, count, count_desc, + source, destination, iov_len_local ); + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + } + } +}