diff --git a/ompi/datatype/convertor.c b/ompi/datatype/convertor.c index f6649ba82a..7c3f94557f 100644 --- a/ompi/datatype/convertor.c +++ b/ompi/datatype/convertor.c @@ -52,6 +52,9 @@ OBJ_CLASS_INSTANCE(ompi_convertor_t, opal_object_t, ompi_convertor_construct, om static ompi_convertor_master_t* ompi_convertor_master_list = NULL; +extern conversion_fct_t ompi_ddt_heterogeneous_copy_functions[DT_MAX_PREDEFINED]; +extern conversion_fct_t ompi_ddt_copy_functions[DT_MAX_PREDEFINED]; + void ompi_convertor_destroy_masters( void ) { ompi_convertor_master_t* master = ompi_convertor_master_list; @@ -59,15 +62,23 @@ void ompi_convertor_destroy_masters( void ) while( NULL != master ) { ompi_convertor_master_list = master->next; master->next = NULL; + /* Cleanup the conversion function if not one of the defaults */ + if( (master->pFunctions != ompi_ddt_heterogeneous_copy_functions) && + (master->pFunctions != ompi_ddt_copy_functions) ) + free( master->pFunctions ); + free( master ); master = ompi_convertor_master_list; } } -extern conversion_fct_t ompi_ddt_heterogeneous_copy_functions[DT_MAX_PREDEFINED]; -extern conversion_fct_t ompi_ddt_copy_functions[DT_MAX_PREDEFINED]; - -ompi_convertor_master_t* ompi_convertor_find_or_create_master( uint32_t remote_arch ) +/** + * Find or create a convertor suitable for the remote architecture. If there + * is already a master convertor for this architecture then return it. + * Otherwise, create and initialize a full featured master convertor. + */ +ompi_convertor_master_t* +ompi_convertor_find_or_create_master( uint32_t remote_arch ) { ompi_convertor_master_t* master = ompi_convertor_master_list; int i; @@ -86,25 +97,37 @@ ompi_convertor_master_t* ompi_convertor_find_or_create_master( uint32_t remote_a ompi_convertor_master_list = master; master->remote_arch = remote_arch; master->flags = 0; + master->hetero_mask = 0; /* Most of the sizes will be identical, so for now just make a copy of * the local ones. 
As master->remote_sizes is defined as being an array of * consts we have to manually cast it before using it for writing purposes. */ remote_sizes = (int32_t*)master->remote_sizes; - for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { remote_sizes[i] = ompi_ddt_local_sizes[i]; } + /** + * If the local and remote architecture are the same there is no need + * to check for the remote data sizes. They will always be the same as + * the local ones. + */ + if( master->remote_arch == ompi_mpi_local_arch ) { + master->pFunctions = ompi_ddt_copy_functions; + master->flags |= CONVERTOR_HOMOGENEOUS; + return master; + } + /* Find out the remote bool size */ if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_BOOLIS8 ) ) { remote_sizes[DT_CXX_BOOL] = 1; - } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_LOGICALIS16 ) ) { + } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_BOOLIS16 ) ) { remote_sizes[DT_CXX_BOOL] = 2; - } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_LOGICALIS32 ) ) { + } else if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_BOOLIS32 ) ) { remote_sizes[DT_CXX_BOOL] = 4; } else { opal_output( 0, "Unknown sizeof(bool) for the remote architecture\n" ); } + /* check the length of the long */ if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_LONGIS64 ) ) { remote_sizes[DT_LONG] = 8; @@ -126,13 +149,38 @@ ompi_convertor_master_t* ompi_convertor_find_or_create_master( uint32_t remote_a opal_output( 0, "Unknown sizeof(fortran logical) for the remote architecture\n" ); } - if( master->remote_arch == ompi_mpi_local_arch ) { - master->pFunctions = ompi_ddt_copy_functions; - master->flags |= CONVERTOR_HOMOGENEOUS; - } else { - master->pFunctions = ompi_ddt_heterogeneous_copy_functions; + /** + * Now we can compute the conversion mask. For all sizes where the remote + * and local architecture differ a conversion is needed. 
Moreover, if the + * 2 architectures don't have the same endianness all data with a length + * over 1 byte (with the exception of logicals) have to be byte-swapped. The mask is 64 bits wide, so each bit is built from a uint64_t one: an int shift is undefined once i reaches the width of int. + */ + for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { + if( remote_sizes[i] != ompi_ddt_local_sizes[i] ) + master->hetero_mask |= ((uint64_t)1 << i); } + if( ompi_arch_checkmask( &master->remote_arch, OMPI_ARCH_ISBIGENDIAN ) != + ompi_arch_checkmask( &ompi_mpi_local_arch, OMPI_ARCH_ISBIGENDIAN ) ) { + uint64_t hetero_mask = 0; + for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { + if( remote_sizes[i] > 1 ) + hetero_mask |= ((uint64_t)1 << i); + } + hetero_mask &= ~(((uint64_t)1 << DT_LOGIC) | ((uint64_t)1 << DT_CXX_BOOL)); + master->hetero_mask |= hetero_mask; + } + master->pFunctions = malloc( sizeof(ompi_ddt_heterogeneous_copy_functions) ); + /** + * Usually the heterogeneous functions are slower than the copy ones. Let's + * try to minimize the usage of the heterogeneous versions. 
+ */ + for( i = DT_CHAR; i < DT_MAX_PREDEFINED; i++ ) { + if( master->hetero_mask & ((uint64_t)1 << i) ) + master->pFunctions[i] = ompi_ddt_heterogeneous_copy_functions[i]; + else + master->pFunctions[i] = ompi_ddt_copy_functions[i]; + } /* We're done so far, return the mater convertor */ return master; } @@ -378,16 +426,17 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, */ #define OMPI_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ) \ { \ + uint64_t bdt_mask = 0; \ convertor->pBaseBuf = (char*)pUserBuf; \ convertor->count = count; \ \ + /* Compute the local and remote sizes */ \ + convertor->local_size = convertor->count * datatype->size; \ /* Grab the datatype part of the flags */ \ convertor->flags &= CONVERTOR_TYPE_MASK; \ convertor->flags |= (CONVERTOR_DATATYPE_MASK & datatype->flags); \ convertor->pDesc = (ompi_datatype_t*)datatype; \ \ - /* Compute the local and remote sizes */ \ - convertor->local_size = convertor->count * datatype->size; \ /* If the data is empty we just mark the convertor as \ * completed. 
With this flag set the pack and unpack functions \ * will not do anything. In order to decrease the data \ @@ -405,9 +454,9 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, convertor->use_desc = &(datatype->opt_desc); \ convertor->flags |= CONVERTOR_HOMOGENEOUS; \ } else { \ - int i; \ - uint64_t bdt_mask = datatype->bdt_used >> DT_CHAR; \ ompi_convertor_master_t* master; \ + int i; \ + bdt_mask = datatype->bdt_used >> DT_CHAR; \ master = convertor->master; \ convertor->remote_size = 0; \ for( i = DT_CHAR; bdt_mask != 0; i++, bdt_mask >>= 1 ) { \ @@ -418,10 +467,7 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, } \ convertor->remote_size *= convertor->count; \ convertor->use_desc = &(datatype->desc); \ - if( convertor->flags & CONVERTOR_SEND ) { \ - convertor->bConverted = 0; \ - return OMPI_SUCCESS; \ - } \ + bdt_mask = datatype->bdt_used & master->hetero_mask; \ } \ assert( NULL != convertor->use_desc->desc ); \ /* For predefined datatypes (contiguous) do nothing more */ \ @@ -429,11 +475,10 @@ int32_t ompi_convertor_set_position_nocheck( ompi_convertor_t* convertor, if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && \ (convertor->flags & DT_FLAG_NO_GAPS) && \ ((convertor->flags & CONVERTOR_SEND) || \ - (convertor->flags & CONVERTOR_HOMOGENEOUS)) ) { \ + (0 == bdt_mask)) ) { \ convertor->bConverted = 0; \ return OMPI_SUCCESS; \ } \ - \ { \ uint32_t required_stack_length = datatype->btypes[DT_LOOP] + 1; \ \ @@ -503,7 +548,6 @@ ompi_convertor_prepare_for_send( ompi_convertor_t* convertor, if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) { if( datatype->flags & DT_FLAG_CONTIGUOUS ) { - assert( convertor->flags & DT_FLAG_CONTIGUOUS ); if( ((datatype->ub - datatype->lb) == (long)datatype->size) || (1 >= convertor->count) ) convertor->fAdvance = ompi_pack_homogeneous_contig_checksum; @@ -514,7 +558,6 @@ ompi_convertor_prepare_for_send( ompi_convertor_t* convertor, } } else { if( datatype->flags & 
DT_FLAG_CONTIGUOUS ) { - assert( convertor->flags & DT_FLAG_CONTIGUOUS ); if( ((datatype->ub - datatype->lb) == (long)datatype->size) || (1 >= convertor->count) ) convertor->fAdvance = ompi_pack_homogeneous_contig; diff --git a/ompi/datatype/convertor_internal.h b/ompi/datatype/convertor_internal.h index 3ca912c529..dca5f2d903 100644 --- a/ompi/datatype/convertor_internal.h +++ b/ompi/datatype/convertor_internal.h @@ -26,6 +26,7 @@ typedef struct ompi_convertor_master_t { struct ompi_convertor_master_t* next; uint32_t remote_arch; uint32_t flags; + uint64_t hetero_mask; const int32_t remote_sizes[DT_MAX_PREDEFINED]; conversion_fct_t* pFunctions; /**< the convertor functions pointer */ } ompi_convertor_master_t; diff --git a/ompi/datatype/copy_functions_heterogeneous.c b/ompi/datatype/copy_functions_heterogeneous.c index 8ba8aa47f7..4f11ef2e04 100644 --- a/ompi/datatype/copy_functions_heterogeneous.c +++ b/ompi/datatype/copy_functions_heterogeneous.c @@ -156,7 +156,7 @@ copy_2complex_##TYPENAME##_heterogeneous(ompi_convertor_t *pConvertor, uint32_t } \ } \ *advance = count * from_extent; \ - return count; \ + return count; \ } diff --git a/ompi/datatype/datatype_internal.h b/ompi/datatype/datatype_internal.h index f58e08d5a6..f710220905 100644 --- a/ompi/datatype/datatype_internal.h +++ b/ompi/datatype/datatype_internal.h @@ -150,26 +150,29 @@ typedef struct ddt_elem_id_description ddt_elem_id_description; * by a set of basic elements. 
*/ struct ddt_elem_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t count; /**< number of elements */ - long disp; /**< displacement of the first element */ - int32_t extent; /**< extent of each element */ + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t count; /**< number of blocks */ + uint32_t blocklen; /**< number of elements on each block */ + int32_t extent; /**< extent of each block (in bytes) */ + long disp; /**< displacement of the first block */ }; typedef struct ddt_elem_desc ddt_elem_desc_t; struct ddt_loop_desc { ddt_elem_id_description common; /**< basic data description and flags */ uint32_t loops; /**< number of elements */ - long extent; /**< extent of the whole loop */ + uint32_t unused; /**< not used right now */ uint32_t items; /**< number of items in the loop */ + long extent; /**< extent of the whole loop */ }; typedef struct ddt_loop_desc ddt_loop_desc_t; struct ddt_endloop_desc { ddt_elem_id_description common; /**< basic data description and flags */ uint32_t items; /**< number of elements */ - long first_elem_disp; /**< total extent of the loop taking in account the repetitions */ + uint32_t unused; /**< not used right now */ uint32_t size; /**< real size of the data in the loop */ + long first_elem_disp; /**< the displacement of the first block in the loop */ }; typedef struct ddt_endloop_desc ddt_endloop_desc_t; @@ -180,13 +183,14 @@ union dt_elem_desc { }; #define CREATE_LOOP_START( _place, _count, _items, _extent, _flags ) \ -do { \ - (_place)->loop.common.type = DT_LOOP; \ - (_place)->loop.common.flags = (_flags) & ~DT_FLAG_DATA; \ - (_place)->loop.loops = (_count); \ - (_place)->loop.items = (_items); \ - (_place)->loop.extent = (_extent); \ -} while(0) + do { \ + (_place)->loop.common.type = DT_LOOP; \ + (_place)->loop.common.flags = (_flags) & ~DT_FLAG_DATA; \ + (_place)->loop.loops = (_count); \ + (_place)->loop.items = (_items); \ + 
(_place)->loop.extent = (_extent); \ + (_place)->loop.unused = -1; \ + } while(0) #define CREATE_LOOP_END( _place, _items, _first_item_disp, _size, _flags ) \ do { \ @@ -195,16 +199,18 @@ do { \ (_place)->end_loop.items = (_items); \ (_place)->end_loop.first_elem_disp = (_first_item_disp); \ (_place)->end_loop.size = (_size); /* the size inside the loop */ \ + (_place)->end_loop.unused = -1; \ } while(0) -#define CREATE_ELEM( _place, _type, _flags, _count, _disp, _extent ) \ -do { \ - (_place)->elem.common.flags = (_flags) | DT_FLAG_DATA_C | DT_FLAG_DATA; \ - (_place)->elem.common.type = (_type); \ - (_place)->elem.count = (_count); \ - (_place)->elem.disp = (_disp); \ - (_place)->elem.extent = (_extent); \ -} while(0) +#define CREATE_ELEM( _place, _type, _flags, _count, _disp, _extent ) \ + do { \ + (_place)->elem.common.flags = (_flags) | DT_FLAG_DATA_C | DT_FLAG_DATA; \ + (_place)->elem.common.type = (_type); \ + (_place)->elem.count = (_count); \ + (_place)->elem.disp = (_disp); \ + (_place)->elem.extent = (_extent); \ + (_place)->elem.blocklen = 1; \ + } while(0) typedef struct { float r;