From f4da7a80bd194f97fae7a23f0a161de519cfb4b2 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 3 Oct 2006 08:13:16 +0000 Subject: [PATCH] Fine grain selection for heterogeneous environments. The hetero versions of the conversion functions are more complex and costly than a simple memcpy. Therefore, we want to decrease as much as possible the usage of these functions. We now check not only the HOMOGENEOUS flag on the datatype or convertor, but also the bits indicating a type is in use. If a communication transfers a type having the same representation on both peers we can use the optimized version of the conversion. At the same time we build a more accurate conversion table for each master convertor, based on the minimum differences between the 2 architectures. This commit was SVN r11945. --- ompi/datatype/convertor.c | 91 ++++++++++++++------ ompi/datatype/convertor_internal.h | 1 + ompi/datatype/copy_functions_heterogeneous.c | 2 +- ompi/datatype/datatype_internal.h | 48 ++++++----- 4 files changed, 96 insertions(+), 46 deletions(-) diff --git a/ompi/datatype/convertor.c b/ompi/datatype/convertor.c index f6649ba82a..7c3f94557f 100644 --- a/ompi/datatype/convertor.c +++ b/ompi/datatype/convertor.c @@ -52,6 +52,9 @@ OBJ_CLASS_INSTANCE(ompi_convertor_t, opal_object_t, ompi_convertor_construct, om static ompi_convertor_master_t* ompi_convertor_master_list = NULL; +extern conversion_fct_t ompi_ddt_heterogeneous_copy_functions[DT_MAX_PREDEFINED]; +extern conversion_fct_t ompi_ddt_copy_functions[DT_MAX_PREDEFINED]; + void ompi_convertor_destroy_masters( void ) { ompi_convertor_master_t* master = ompi_convertor_master_list; @@ -59,15 +62,23 @@ void ompi_convertor_destroy_masters( void ) while( NULL != master ) { ompi_convertor_master_list = master->next; master->next = NULL; + /* Cleanup the conversion function if not one of the defaults */ + if( (master->pFunctions != ompi_ddt_heterogeneous_copy_functions) && + (master->pFunctions != ompi_ddt_copy_functions) ) + free( 
master->pFunctions ); + free( master ); master = ompi_convertor_master_list; } } -extern conversion_fct_t ompi_ddt_heterogeneous_copy_functions[DT_MAX_PREDEFINED]; -extern conversion_fct_t ompi_ddt_copy_functions[DT_MAX_PREDEFINED]; - -ompi_convertor_master_t* ompi_convertor_find_or_create_master( uint32_t remote_arch ) +/** + * Find or create a convertor suitable for the remote architecture. If there + * is already a master convertor for this architecture then return it. + * Otherwise, create and initialize a full featured master convertor. + */ +ompi_convertor_master_t* +ompi_convertor_find_or_create_master( uint32_t remote_arch ) { ompi_convertor_master_t* master = ompi_convertor_master_list; int i; @@ -86,25 +97,37 @@ ompi_convertor_master_t* ompi_convertor_find_or_create_master( uint32_t remote_a ompi_convertor_master_list = master; master->remote_arch = remote_arch; master->flags = 0; + master->hetero_mask = 0; /* Most of the sizes will be identical, so for now just make a copy of * the local ones. As master->remote_sizes is defined as being an array of * consts we have to manually cast it before using it for writing purposes. */ remote_sizes = (int32_t*)master->remote_sizes; - for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { remote_sizes[i] = ompi_ddt_local_sizes[i]; } + /** + * If the local and remote architecture are the same there is no need + * to check for the remote data sizes. They will always be the same as + * the local ones. 
+ */ + if( master->remote_arch == ompi_mpi_local_arch ) { + master->pFunctions = ompi_ddt_copy_functions; + master->flags |= CONVERTOR_HOMOGENEOUS; + return master; + } + /* Find out the remote bool size */ if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_BOOLIS8 ) ) { remote_sizes[DT_CXX_BOOL] = 1; - } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_LOGICALIS16 ) ) { + } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_BOOLIS16 ) ) { remote_sizes[DT_CXX_BOOL] = 2; - } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_LOGICALIS32 ) ) { + } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_BOOLIS32 ) ) { remote_sizes[DT_CXX_BOOL] = 4; } else { opal_output( 0, "Unknown sizeof(bool) for the remote architecture\n" ); } + /* check the length of the long */ if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_LONGIS64 ) ) { remote_sizes[DT_LONG] = 8; @@ -126,13 +149,38 @@ ompi_convertor_master_t* ompi_convertor_find_or_create_master( uint32_t remote_a opal_output( 0, "Unknown sizeof(fortran logical) for the remote architecture\n" ); } - if( master->remote_arch == ompi_mpi_local_arch ) { - master->pFunctions = ompi_ddt_copy_functions; - master->flags |= CONVERTOR_HOMOGENEOUS; - } else { - master->pFunctions = ompi_ddt_heterogeneous_copy_functions; + /** + * Now we can compute the conversion mask. For all sizes where the remote + * and local architecture differ a conversion is needed. Moreover, if the + * 2 architectures don't have the same endianess all data with a length + * over 2 bytes (with the exception of logicals) have to be byte-swapped. 
+ */ + for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { + if( remote_sizes[i] != ompi_ddt_local_sizes[i] ) + master->hetero_mask |= (1 << i); } + if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_ISBIGENDIAN ) != + ompi_arch_checkmask( &ompi_mpi_local_arch, OMPI_ARCH_ISBIGENDIAN ) ) { + uint64_t hetero_mask = 0; + for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { + if( remote_sizes[i] > 2 ) + hetero_mask |= (1 << i); + } + hetero_mask &= ~((1 << DT_LOGIC) | (1 << DT_CXX_BOOL)); + master->hetero_mask |= hetero_mask; + } + master->pFunctions = malloc( sizeof(ompi_ddt_heterogeneous_copy_functions) ); + /** + * Usually the heterogeneous functions are slower than the copy ones. Let's + * try to minimize the usage of the heterogeneous versions. + */ + for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { + if( master->hetero_mask & (1 << i) ) + master->pFunctions[i] = ompi_ddt_heterogeneous_copy_functions[i]; + else + master->pFunctions[i] = ompi_ddt_copy_functions[i]; + } /* We're done so far, return the mater convertor */ return master; } @@ -378,16 +426,17 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, */ #define OMPI_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ) \ { \ + uint64_t bdt_mask = 0; \ convertor->pBaseBuf = (char*)pUserBuf; \ convertor->count = count; \ \ + /* Compute the local and remote sizes */ \ + convertor->local_size = convertor->count * datatype->size; \ /* Grab the datatype part of the flags */ \ convertor->flags &= CONVERTOR_TYPE_MASK; \ convertor->flags |= (CONVERTOR_DATATYPE_MASK & datatype->flags); \ convertor->pDesc = (ompi_datatype_t*)datatype; \ \ - /* Compute the local and remote sizes */ \ - convertor->local_size = convertor->count * datatype->size; \ /* If the data is empty we just mark the convertor as \ * completed. With this flag set the pack and unpack functions \ * will not do anything. 
In order to decrease the data \ @@ -405,9 +454,9 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, convertor->use_desc = &(datatype->opt_desc); \ convertor->flags |= CONVERTOR_HOMOGENEOUS; \ } else { \ - int i; \ - uint64_t bdt_mask = datatype->bdt_used >> DT_CHAR; \ ompi_convertor_master_t* master; \ + int i; \ + bdt_mask = datatype->bdt_used >> DT_CHAR; \ master = convertor->master; \ convertor->remote_size = 0; \ for( i = DT_CHAR; bdt_mask != 0; i++, bdt_mask >>= 1 ) { \ @@ -418,10 +467,7 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, } \ convertor->remote_size *= convertor->count; \ convertor->use_desc = &(datatype->desc); \ - if( convertor->flags & CONVERTOR_SEND ) { \ - convertor->bConverted = 0; \ - return OMPI_SUCCESS; \ - } \ + bdt_mask = datatype->bdt_used & master->hetero_mask; \ } \ assert( NULL != convertor->use_desc->desc ); \ /* For predefined datatypes (contiguous) do nothing more */ \ @@ -429,11 +475,10 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && \ (convertor->flags & DT_FLAG_NO_GAPS) && \ ((convertor->flags & CONVERTOR_SEND) || \ - (convertor->flags & CONVERTOR_HOMOGENEOUS)) ) { \ + (0 == bdt_mask)) ) { \ convertor->bConverted = 0; \ return OMPI_SUCCESS; \ } \ - \ { \ uint32_t required_stack_length = datatype->btypes[DT_LOOP] + 1; \ \ @@ -503,7 +548,6 @@ ompi_convertor_prepare_for_send( ompi_convertor_t* convertor, if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { if( datatype->flags & DT_FLAG_CONTIGUOUS ) { - assert( convertor->flags & DT_FLAG_CONTIGUOUS ); if( ((datatype->ub - datatype->lb) == (long)datatype->size) || (1 >= convertor->count) ) convertor->fAdvance = ompi_pack_homogeneous_contig_checksum; @@ -514,7 +558,6 @@ ompi_convertor_prepare_for_send( ompi_convertor_t* convertor, } } else { if( datatype->flags & DT_FLAG_CONTIGUOUS ) { - assert( convertor->flags & DT_FLAG_CONTIGUOUS ); if( 
((datatype->ub - datatype->lb) == (long)datatype->size) || (1 >= convertor->count) ) convertor->fAdvance = ompi_pack_homogeneous_contig; diff --git a/ompi/datatype/convertor_internal.h b/ompi/datatype/convertor_internal.h index 3ca912c529..dca5f2d903 100644 --- a/ompi/datatype/convertor_internal.h +++ b/ompi/datatype/convertor_internal.h @@ -26,6 +26,7 @@ typedef struct ompi_convertor_master_t { struct ompi_convertor_master_t* next; uint32_t remote_arch; uint32_t flags; + uint64_t hetero_mask; const int32_t remote_sizes[DT_MAX_PREDEFINED]; conversion_fct_t* pFunctions; /**< the convertor functions pointer */ } ompi_convertor_master_t; diff --git a/ompi/datatype/copy_functions_heterogeneous.c b/ompi/datatype/copy_functions_heterogeneous.c index 8ba8aa47f7..4f11ef2e04 100644 --- a/ompi/datatype/copy_functions_heterogeneous.c +++ b/ompi/datatype/copy_functions_heterogeneous.c @@ -156,7 +156,7 @@ copy_2complex_##TYPENAME##_heterogeneous(ompi_convertor_t *pConvertor, uint32_t } \ } \ *advance = count * from_extent; \ - return count; \ + return count; \ } diff --git a/ompi/datatype/datatype_internal.h b/ompi/datatype/datatype_internal.h index f58e08d5a6..f710220905 100644 --- a/ompi/datatype/datatype_internal.h +++ b/ompi/datatype/datatype_internal.h @@ -150,26 +150,29 @@ typedef struct ddt_elem_id_description ddt_elem_id_description; * by a set of basic elements. 
*/ struct ddt_elem_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t count; /**< number of elements */ - long disp; /**< displacement of the first element */ - int32_t extent; /**< extent of each element */ + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t count; /**< number of blocks */ + uint32_t blocklen; /**< number of elements on each block */ + int32_t extent; /**< extent of each block (in bytes) */ + long disp; /**< displacement of the first block */ }; typedef struct ddt_elem_desc ddt_elem_desc_t; struct ddt_loop_desc { ddt_elem_id_description common; /**< basic data description and flags */ uint32_t loops; /**< number of elements */ - long extent; /**< extent of the whole loop */ + uint32_t unused; /**< not used right now */ uint32_t items; /**< number of items in the loop */ + long extent; /**< extent of the whole loop */ }; typedef struct ddt_loop_desc ddt_loop_desc_t; struct ddt_endloop_desc { ddt_elem_id_description common; /**< basic data description and flags */ uint32_t items; /**< number of elements */ - long first_elem_disp; /**< total extent of the loop taking in account the repetitions */ + uint32_t unused; /**< not used right now */ uint32_t size; /**< real size of the data in the loop */ + long first_elem_disp; /**< the displacement of the first block in the loop */ }; typedef struct ddt_endloop_desc ddt_endloop_desc_t; @@ -180,13 +183,14 @@ union dt_elem_desc { }; #define CREATE_LOOP_START( _place, _count, _items, _extent, _flags ) \ -do { \ - (_place)->loop.common.type = DT_LOOP; \ - (_place)->loop.common.flags = (_flags) & ~DT_FLAG_DATA; \ - (_place)->loop.loops = (_count); \ - (_place)->loop.items = (_items); \ - (_place)->loop.extent = (_extent); \ -} while(0) + do { \ + (_place)->loop.common.type = DT_LOOP; \ + (_place)->loop.common.flags = (_flags) & ~DT_FLAG_DATA; \ + (_place)->loop.loops = (_count); \ + (_place)->loop.items = (_items); \ + 
(_place)->loop.extent = (_extent); \ + (_place)->loop.unused = -1; \ + } while(0) #define CREATE_LOOP_END( _place, _items, _first_item_disp, _size, _flags ) \ do { \ @@ -195,16 +199,18 @@ do { \ (_place)->end_loop.items = (_items); \ (_place)->end_loop.first_elem_disp = (_first_item_disp); \ (_place)->end_loop.size = (_size); /* the size inside the loop */ \ + (_place)->end_loop.unused = -1; \ } while(0) -#define CREATE_ELEM( _place, _type, _flags, _count, _disp, _extent ) \ -do { \ - (_place)->elem.common.flags = (_flags) | DT_FLAG_DATA_C | DT_FLAG_DATA; \ - (_place)->elem.common.type = (_type); \ - (_place)->elem.count = (_count); \ - (_place)->elem.disp = (_disp); \ - (_place)->elem.extent = (_extent); \ -} while(0) +#define CREATE_ELEM( _place, _type, _flags, _count, _disp, _extent ) \ + do { \ + (_place)->elem.common.flags = (_flags) | DT_FLAG_DATA_C | DT_FLAG_DATA; \ + (_place)->elem.common.type = (_type); \ + (_place)->elem.count = (_count); \ + (_place)->elem.disp = (_disp); \ + (_place)->elem.extent = (_extent); \ + (_place)->elem.blocklen = 1; \ + } while(0) typedef struct { float r;