1
1

Always build support for HETEROGENEOUS environments (this is needed to
provide external32 support). Add a pack function that performs
send-side conversion (needed on little-endian machines in
order to pack in the external32 format).
Этот коммит содержится в:
George Bosilca 2016-02-23 01:32:41 -06:00 коммит произвёл Gilles Gouaillardet
родитель 639f4b1086
Коммит cf2bb20bac
4 изменённых файлов: 277 добавлений и 57 удалений

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -449,16 +449,17 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
/**
* Compute the remote size.
* Compute the remote size. If necessary remove the homogeneous flag
* and redirect the convertor description toward the non-optimized
* datatype representation.
*/
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
#define OPAL_CONVERTOR_COMPUTE_REMOTE_SIZE(convertor, datatype, bdt_mask) \
{ \
if( OPAL_UNLIKELY(0 != (bdt_mask)) ) { \
opal_convertor_master_t* master; \
int i; \
uint32_t mask = datatype->bdt_used; \
convertor->flags ^= CONVERTOR_HOMOGENEOUS; \
convertor->flags &= (~CONVERTOR_HOMOGENEOUS); \
master = convertor->master; \
convertor->remote_size = 0; \
for( i = OPAL_DATATYPE_FIRST_TYPE; mask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++ ) { \
@ -472,13 +473,6 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
convertor->use_desc = &(datatype->desc); \
} \
}
#else
#define OPAL_CONVERTOR_COMPUTE_REMOTE_SIZE(convertor, datatype, bdt_mask) \
{ \
assert(0 == (bdt_mask)); \
(void)bdt_mask; /* silence compiler warning */ \
}
#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */
/**
* This macro will initialize a convertor based on a previously created
@ -511,16 +505,13 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
convertor->flags |= (CONVERTOR_NO_OP | CONVERTOR_HOMOGENEOUS); \
convertor->pDesc = (opal_datatype_t*)datatype; \
convertor->bConverted = 0; \
/* By default consider the optimized description */ \
convertor->use_desc = &(datatype->opt_desc); \
\
convertor->remote_size = convertor->local_size; \
if( OPAL_LIKELY(convertor->remoteArch == opal_local_arch) ) { \
if( (convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) == OPAL_DATATYPE_FLAG_NO_GAPS ) { \
return OPAL_SUCCESS; \
} \
if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_CONTIGUOUS)) \
== OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == count) ) { \
if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && \
((convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) || \
((convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == count))) ) { \
return OPAL_SUCCESS; \
} \
} \
@ -532,8 +523,9 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
/* For predefined datatypes (contiguous) do nothing more */ \
/* if checksum is enabled then always continue */ \
if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) \
== OPAL_DATATYPE_FLAG_NO_GAPS) && \
(convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { \
== OPAL_DATATYPE_FLAG_NO_GAPS) && \
((convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) == \
(CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { \
return OPAL_SUCCESS; \
} \
convertor->flags &= ~CONVERTOR_NO_OP; \
@ -566,26 +558,24 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) {
convertor->fAdvance = opal_unpack_general_checksum;
} else
#endif
if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
convertor->fAdvance = opal_unpack_homogeneous_contig_checksum;
} else {
convertor->fAdvance = opal_generic_simple_unpack_checksum;
if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
convertor->fAdvance = opal_unpack_homogeneous_contig_checksum;
} else {
convertor->fAdvance = opal_generic_simple_unpack_checksum;
}
}
} else {
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) {
convertor->fAdvance = opal_unpack_general;
} else
#endif
if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
convertor->fAdvance = opal_unpack_homogeneous_contig;
} else {
convertor->fAdvance = opal_generic_simple_unpack;
if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
convertor->fAdvance = opal_unpack_homogeneous_contig;
} else {
convertor->fAdvance = opal_generic_simple_unpack;
}
}
}
return OPAL_SUCCESS;
@ -605,24 +595,32 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
if( ((datatype->ub - datatype->lb) == (OPAL_PTRDIFF_TYPE)datatype->size)
|| (1 >= convertor->count) )
convertor->fAdvance = opal_pack_homogeneous_contig_checksum;
else
convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum;
if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) {
convertor->fAdvance = opal_pack_general_checksum;
} else {
convertor->fAdvance = opal_generic_simple_pack_checksum;
if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
if( ((datatype->ub - datatype->lb) == (OPAL_PTRDIFF_TYPE)datatype->size)
|| (1 >= convertor->count) )
convertor->fAdvance = opal_pack_homogeneous_contig_checksum;
else
convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum;
} else {
convertor->fAdvance = opal_generic_simple_pack_checksum;
}
}
} else {
if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
if( ((datatype->ub - datatype->lb) == (OPAL_PTRDIFF_TYPE)datatype->size)
|| (1 >= convertor->count) )
convertor->fAdvance = opal_pack_homogeneous_contig;
else
convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps;
if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) {
convertor->fAdvance = opal_pack_general;
} else {
convertor->fAdvance = opal_generic_simple_pack;
if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
if( ((datatype->ub - datatype->lb) == (OPAL_PTRDIFF_TYPE)datatype->size)
|| (1 >= convertor->count) )
convertor->fAdvance = opal_pack_homogeneous_contig;
else
convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps;
} else {
convertor->fAdvance = opal_generic_simple_pack;
}
}
}
return OPAL_SUCCESS;
@ -678,15 +676,33 @@ int opal_convertor_clone( const opal_convertor_t* source,
void opal_convertor_dump( opal_convertor_t* convertor )
{
printf( "Convertor %p count %d stack position %d bConverted %ld\n", (void*)convertor,
convertor->count, convertor->stack_pos, (unsigned long)convertor->bConverted );
printf( "\tlocal_size %ld remote_size %ld flags %X stack_size %d pending_length %d\n",
(unsigned long)convertor->local_size, (unsigned long)convertor->remote_size,
convertor->flags, convertor->stack_size, convertor->partial_length );
opal_output( 0, "Convertor %p count %d stack position %d bConverted %ld\n"
"\tlocal_size %ld remote_size %ld flags %X stack_size %d pending_length %d\n"
"\tremote_arch %u local_arch %u\n",
(void*)convertor,
convertor->count, convertor->stack_pos, (unsigned long)convertor->bConverted,
(unsigned long)convertor->local_size, (unsigned long)convertor->remote_size,
convertor->flags, convertor->stack_size, convertor->partial_length,
convertor->remoteArch, opal_local_arch );
if( convertor->flags & CONVERTOR_RECV ) opal_output( 0, "unpack ");
if( convertor->flags & CONVERTOR_SEND ) opal_output( 0, "pack ");
if( convertor->flags & CONVERTOR_SEND_CONVERSION ) opal_output( 0, "conversion ");
if( convertor->flags & CONVERTOR_HOMOGENEOUS ) opal_output( 0, "homogeneous " );
else opal_output( 0, "heterogeneous ");
if( convertor->flags & CONVERTOR_NO_OP ) opal_output( 0, "no_op ");
if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) opal_output( 0, "checksum ");
if( convertor->flags & CONVERTOR_CUDA ) opal_output( 0, "CUDA ");
if( convertor->flags & CONVERTOR_CUDA_ASYNC ) opal_output( 0, "CUDA Async ");
if( convertor->flags & CONVERTOR_COMPLETED ) opal_output( 0, "COMPLETED ");
opal_datatype_dump( convertor->pDesc );
printf( "Actual stack representation\n" );
opal_datatype_dump_stack( convertor->pStack, convertor->stack_pos,
convertor->pDesc->desc.desc, convertor->pDesc->name );
if( !((0 == convertor->stack_pos) &&
((size_t)convertor->pStack[convertor->stack_pos].index > convertor->pDesc->desc.length)) ) {
/* only if the convertor is completely initialized */
opal_output( 0, "Actual stack representation\n" );
opal_datatype_dump_stack( convertor->pStack, convertor->stack_pos,
convertor->pDesc->desc.desc, convertor->pDesc->name );
}
}

Просмотреть файл

@ -175,9 +175,7 @@ static inline int opal_convertor_cleanup( opal_convertor_t* convertor )
*/
static inline int32_t opal_convertor_need_buffers( const opal_convertor_t* pConvertor )
{
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
if (OPAL_UNLIKELY(0 == (pConvertor->flags & CONVERTOR_HOMOGENEOUS))) return 1;
#endif
#if OPAL_CUDA_SUPPORT
if( pConvertor->flags & (CONVERTOR_CUDA | CONVERTOR_CUDA_UNIFIED)) return 1;
#endif

Просмотреть файл

@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2014 The University of Tennessee and The University
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@ -42,10 +42,12 @@
#define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum
#define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum
#define opal_generic_simple_pack_function opal_generic_simple_pack_checksum
#define opal_pack_general_function opal_pack_general_checksum
#else
#define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig
#define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps
#define opal_generic_simple_pack_function opal_generic_simple_pack
#define opal_pack_general_function opal_pack_general
#endif /* defined(CHECKSUM) */
@ -393,3 +395,199 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor,
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); );
return 0;
}
/*
* Remember that the first item in the stack (ie. position 0) is the number
* of times the datatype is involved in the operation (ie. the count argument
* in the MPI_ call).
*/
/* Convert data from the user buffer (laid out according to the datatype
 * description) into one or more contiguous output buffers, each with a
 * predefined size.
 * return OPAL_SUCCESS if everything went OK and there is still data left to
 * convert (additional calls with other output buffers are needed)
 * 1 if everything went fine and the data was completely converted
 * -1 if something went wrong.
 */
/**
 * Pack a run of elements of one predefined datatype, converting each of them
 * through the per-type conversion function registered in the convertor's
 * master.
 *
 * As many elements as fit in the remaining output space are handled; if not
 * even a single element (at its remote size) fits, the function returns
 * without modifying any of its in/out arguments.
 *
 * @param CONVERTOR   convertor driving the operation (provides the master
 *                    table of remote sizes and conversion functions)
 * @param ELEM        description entry of the predefined element to pack
 * @param COUNT       in: number of elements still to pack;
 *                    out: decreased by the number of elements packed
 * @param SOURCE      in/out: base pointer in the user buffer; the element
 *                    displacement is added internally and the pointer is
 *                    advanced by (elements packed * element extent)
 * @param DESTINATION in/out: write position in the packed buffer, advanced
 *                    by (elements packed * remote element size)
 * @param SPACE       in/out: bytes left in the packed buffer, decreased by
 *                    the number of bytes produced
 */
static inline void
pack_predefined_heterogeneous( opal_convertor_t* CONVERTOR,
                               const dt_elem_desc_t* ELEM,
                               uint32_t* COUNT,
                               unsigned char** SOURCE,
                               unsigned char** DESTINATION,
                               size_t* SPACE )
{
    const opal_convertor_master_t* master = (CONVERTOR)->master;
    const ddt_elem_desc_t* elem_desc = &((ELEM)->elem);
    unsigned char* user_ptr = (*SOURCE) + elem_desc->disp;
    uint32_t todo = *(COUNT);
    size_t remote_elem_size, local_elem_size, produced;
    OPAL_PTRDIFF_TYPE advance;

    remote_elem_size = master->remote_sizes[elem_desc->common.type];
    local_elem_size  = opal_datatype_basicDatatypes[elem_desc->common.type]->size;

    /* Clamp the number of elements to what fits in the output space. */
    if( (todo * remote_elem_size) > *(SPACE) ) {
        todo = (uint32_t)(*(SPACE) / remote_elem_size);
        if( 0 == todo ) {
            return;  /* nothing to do */
        }
    }

    OPAL_DATATYPE_SAFEGUARD_POINTER( user_ptr, (todo * elem_desc->extent), (CONVERTOR)->pBaseBuf,
                                     (CONVERTOR)->pDesc, (CONVERTOR)->count );
    DO_DEBUG( opal_output( 0, "pack [l %s r %s] memcpy( %p, %p, %lu ) => space %lu\n",
                           ((OPAL_PTRDIFF_TYPE)local_elem_size == elem_desc->extent) ? "cont" : "----",
                           ((OPAL_PTRDIFF_TYPE)remote_elem_size == elem_desc->extent) ? "cont" : "----",
                           *(DESTINATION), user_ptr, (unsigned long)remote_elem_size,
                           (unsigned long)(*(SPACE)) ); );

    /* Per-type conversion: source stride is the element extent, destination
     * stride is the remote element size (i.e. the output is contiguous). */
    master->pFunctions[elem_desc->common.type]( CONVERTOR, todo,
                                                user_ptr, *SPACE, elem_desc->extent,
                                                *DESTINATION, *SPACE, remote_elem_size,
                                                &advance );

    /* Total number of bytes written for all the converted elements. */
    produced = remote_elem_size * todo;

    *(SOURCE)      += todo * elem_desc->extent;
    *(DESTINATION) += produced;
    *(SPACE)       -= produced;
    *(COUNT)       -= todo;
}
/**
 * General pack routine: walks the convertor's datatype description and packs
 * the user data, one predefined element run at a time, into the supplied
 * iovec array by calling pack_predefined_heterogeneous().
 *
 * The function name is a macro; depending on CHECKSUM it expands to
 * opal_pack_general_checksum or opal_pack_general.
 *
 * @param pConvertor convertor holding the datatype description, the user
 *                   base pointer and the position stack; the stack and
 *                   bConverted are updated to reflect the new position
 * @param iov        array of output buffers; on return each processed
 *                   entry's iov_len holds the number of bytes written to it
 * @param out_size   in: number of entries in iov;
 *                   out: number of entries that were processed
 * @param max_data   out: total number of bytes packed by this call
 *
 * @return 1 once bConverted equals local_size (CONVERTOR_COMPLETED is set),
 *         0 otherwise (the position is saved on the stack for the next call)
 *
 * NOTE(review): rc, master and advance are declared but never used here, and
 * type only feeds the debug trace. The update_loop_description label has no
 * active goto (the only one sits in an "#if 0" region). Expect compiler
 * warnings; confirm whether they can be removed.
 */
int32_t
opal_pack_general_function( opal_convertor_t* pConvertor,
struct iovec* iov, uint32_t* out_size,
size_t* max_data )
{
dt_stack_t* pStack; /* pointer to the position on the stack */
uint32_t pos_desc; /* actual position in the description of the derived datatype */
uint32_t count_desc; /* the number of items already done in the actual pos_desc */
size_t total_packed = 0; /* total amount packed this time */
dt_elem_desc_t* description;
dt_elem_desc_t* pElem;
const opal_datatype_t *pData = pConvertor->pDesc;
unsigned char *conv_ptr, *iov_ptr;
size_t iov_len_local;
uint32_t iov_count;
int type, rc;
const opal_convertor_master_t* master = pConvertor->master;
ptrdiff_t advance;
DO_DEBUG( opal_output( 0, "opal_convertor_general_pack( %p:%p, {%p, %lu}, %d )\n",
(void*)pConvertor, (void*)pConvertor->pBaseBuf,
iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); );
description = pConvertor->use_desc->desc;
/* For the first step we have to add both displacements to the source. Later, in the
 * main while loop, we will set conv_ptr back to the correct value. This is
 * due to the fact that the convertor can stop in the middle of a data with a count.
 */
/* Pop the saved position (top of the stack) into the local working state. */
pStack = pConvertor->pStack + pConvertor->stack_pos;
pos_desc = pStack->index;
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
count_desc = (uint32_t)pStack->count;
pStack--;
pConvertor->stack_pos--;
pElem = &(description[pos_desc]);
DO_DEBUG( opal_output( 0, "pack start pos_desc %d count_desc %d disp %ld\n"
"stack_pos %d pos_desc %d count_desc %d disp %ld\n",
pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf),
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); );
/* One iteration per output buffer. */
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
iov_ptr = (unsigned char *) iov[iov_count].iov_base;
/* iov_len_local tracks the space still available in the current buffer. */
iov_len_local = iov[iov_count].iov_len;
while( 1 ) {
/* Consume consecutive predefined-data entries of the description. */
while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
type = description[pos_desc].elem.common.type;
/* now here we have a basic datatype */
DO_DEBUG( opal_output( 0, "pack (%p:%ld, %d, %ld) -> (%p, %ld) type %s\n",
pConvertor->pBaseBuf, conv_ptr + pElem->elem.disp - pConvertor->pBaseBuf,
count_desc, description[pos_desc].elem.extent,
iov_ptr, iov_len_local,
opal_datatype_basicDatatypes[type]->name ); );
pack_predefined_heterogeneous( pConvertor, pElem, &count_desc,
&conv_ptr, &iov_ptr, &iov_len_local);
#if 0
PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc,
conv_ptr, iov_ptr, iov_len_local );
#endif
if( 0 == count_desc ) { /* completed */
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
pos_desc++; /* advance to the next data */
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
continue;
}
/* Elements remain but did not fit: this output buffer is done. */
goto complete_loop;
}
if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d"
" pos_desc %d disp %ld space %lu\n",
(int)pStack->count, pConvertor->stack_pos,
pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); );
if( --(pStack->count) == 0 ) { /* end of loop */
if( 0 == pConvertor->stack_pos ) {
/* We lie about the number of output buffers in order to
 * make sure we exit the main loop: after the for-loop increment
 * iov_count exceeds *out_size, so the loop condition fails.
 */
*out_size = iov_count;
goto complete_loop; /* completed */
}
pConvertor->stack_pos--;
pStack--;
pos_desc++;
} else {
/* More iterations left: restart right after the loop's opening entry. */
pos_desc = pStack->index + 1;
if( pStack->index == -1 ) {
/* Index -1 is handled as the whole datatype: move by its extent (ub - lb). */
pStack->disp += (pData->ub - pData->lb);
} else {
assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type );
pStack->disp += description[pStack->index].loop.extent;
}
}
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n",
(int)pStack->count, pConvertor->stack_pos, pos_desc,
count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); );
}
if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr;
#if 0
if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc,
conv_ptr, iov_ptr, iov_len_local );
if( 0 == count_desc ) { /* completed */
pos_desc += pElem->loop.items + 1;
goto update_loop_description;
}
/* Save the stack with the correct last_count value. */
}
#endif /* in a heterogeneous environment we can't handle the contiguous loops */
/* With the contiguous-loop shortcut disabled, conv_ptr has not moved since
 * local_disp was captured, so this difference is zero. */
local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp;
/* Enter the loop: remember its position and remaining count on the stack. */
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc,
pStack->disp + local_disp);
pos_desc++;
update_loop_description: /* update the current state */
conv_ptr = pConvertor->pBaseBuf + pStack->disp;
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" );
continue;
}
}
complete_loop:
/* iov_len becomes (original length - unused space), i.e. the bytes written. */
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
total_packed += iov[iov_count].iov_len;
}
*max_data = total_packed;
pConvertor->bConverted += total_packed; /* update the already converted bytes */
*out_size = iov_count;
/* NOTE(review): total_packed counts bytes written to the output buffers, which
 * pack_predefined_heterogeneous() sizes using the remote element sizes, yet the
 * completion test compares against local_size. If the two sizes can differ,
 * this test may never succeed; confirm whether remote_size was intended. */
if( pConvertor->bConverted == pConvertor->local_size ) {
pConvertor->flags |= CONVERTOR_COMPLETED;
return 1;
}
/* Save the global position for the next round */
PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
conv_ptr - pConvertor->pBaseBuf );
DO_DEBUG( opal_output( 0, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n",
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); );
return 0;
}

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2009 Oak Ridge National Labs. All rights reserved.
@ -24,6 +24,14 @@ BEGIN_C_DECLS
*/
OPAL_DECLSPEC int32_t
opal_pack_general( opal_convertor_t* pConvertor,
struct iovec* iov, uint32_t* out_size,
size_t* max_data );
OPAL_DECLSPEC int32_t
opal_pack_general_checksum( opal_convertor_t* pConvertor,
struct iovec* iov, uint32_t* out_size,
size_t* max_data );
OPAL_DECLSPEC int32_t
opal_unpack_general( opal_convertor_t* pConvertor,
struct iovec* iov, uint32_t* out_size,
size_t* max_data );