1
1

Get rid of the division in the critical path.

Amazing how a bad instruction scheduling can have such a drastic impact
on the code performance. With this change, the get a boost of at least
50% on the performance of data with a small blocklen and/or count.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
George Bosilca 2019-07-10 00:28:29 -04:00
родитель a80255235a
Коммит 3562d70679
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 09C926752C9F09B1
2 изменённых файлов: 38 добавлений и 17 удалений

Просмотреть файл

@ -35,19 +35,24 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
size_t* SPACE )
{
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes;
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t cando_count = *(COUNT), do_now, do_now_bytes;
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
assert( *(COUNT) <= _elem->count * _elem->blocklen);
if( cando_count > *(COUNT) )
cando_count = *(COUNT);
if( (blocklen_bytes * cando_count) > *(SPACE) )
cando_count = (*SPACE) / blocklen_bytes;
do_now = *(COUNT); /* save the COUNT for later */
/* premptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */
goto do_epilog;
}
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
*(COUNT) -= cando_count;
for(; cando_count > 0; cando_count--) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
@ -59,17 +64,19 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
}
goto update_and_return;
}
blocklen_bytes *= _elem->blocklen;
blocklen_bytes *= _elem->blocklen;
if( (_elem->count * _elem->blocklen) == cando_count ) {
goto skip_prolog;
}
/**
* First check if we already did something on this element ? The COUNT is the number
* of remaining predefined types in the current elem, not how many predefined types
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
/* premptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
do_now = do_now % _elem->blocklen; /* any partial elements ? */
if( 0 != do_now ) {
size_t left_in_block = do_now; /* left in the current blocklen */
do_now = (do_now > cando_count ) ? cando_count : do_now;
@ -88,6 +95,7 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
cando_count -= do_now;
}
skip_prolog:
/* Do as many full blocklen as possible */
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
@ -104,6 +112,8 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
* As an epilog do anything left from the last blocklen.
*/
if( 0 != cando_count ) {
do_epilog:
assert( cando_count < _elem->blocklen );
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,

Просмотреть файл

@ -35,19 +35,24 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
size_t* SPACE )
{
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes;
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t cando_count = (*COUNT), do_now, do_now_bytes;
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;
assert( *(COUNT) <= (_elem->count * _elem->blocklen));
if( cando_count > *(COUNT) )
cando_count = *(COUNT);
if( (blocklen_bytes * cando_count) > *(SPACE) )
cando_count = (*SPACE) / blocklen_bytes;
do_now = *(COUNT); /* save the COUNT for later */
/* premptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */
goto do_epilog;
}
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
*(COUNT) -= cando_count;
for(; cando_count > 0; cando_count--) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
@ -59,7 +64,11 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
}
goto update_and_return;
}
blocklen_bytes *= _elem->blocklen;
if( (_elem->count * _elem->blocklen) == cando_count ) {
goto skip_prolog;
}
/**
* First check if we already did something on this element ? The COUNT is the number
@ -67,9 +76,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
/* premptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
do_now = do_now % _elem->blocklen; /* any partial elements ? */
if( 0 != do_now ) {
size_t left_in_block = do_now; /* left in the current blocklen */
do_now = (do_now > cando_count ) ? cando_count : do_now;
@ -88,6 +96,7 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
cando_count -= do_now;
}
skip_prolog:
/* Do as many full blocklen as possible */
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
@ -104,6 +113,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
* As an epilog do anything left from the last blocklen.
*/
if( 0 != cando_count ) {
do_epilog:
assert( cando_count < _elem->blocklen );
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,