Decrease the creation time of a convertor. This change specifically targets contiguous data and/or data with a short description. Each convertor
now has a limited stack attached. If we handle contiguous data, we use this stack, avoiding the malloc/free otherwise needed for stack management. In all other cases the convertor works as before: a stack containing the required number of elements is allocated. This small modification decreases the latency for GM by nearly 0.7 microseconds, as reported by NetPipe. This commit was SVN r3866.
Этот коммит содержится в:
родитель
fb203852d3
Коммит
92e94f5921
@ -173,13 +173,10 @@ OMPI_DECLSPEC int32_t ompi_ddt_copy_content_same_ddt( const dt_desc_t* pData, in
|
||||
|
||||
OMPI_DECLSPEC int32_t ompi_ddt_optimize_short( dt_desc_t* pData, int32_t count, dt_type_desc_t* pTypeDesc );
|
||||
|
||||
/* flags for the datatypes */
|
||||
|
||||
typedef int32_t (*conversion_fct_t)( uint32_t count,
|
||||
const void* from, uint32_t from_len, long from_extent,
|
||||
void* to, uint32_t in_length, long to_extent );
|
||||
|
||||
typedef struct __dt_stack dt_stack_t;
|
||||
typedef struct ompi_convertor_t ompi_convertor_t;
|
||||
typedef int32_t (*convertor_advance_fct_t)( ompi_convertor_t* pConvertor,
|
||||
struct iovec* pInputv,
|
||||
@ -188,21 +185,31 @@ typedef int32_t (*convertor_advance_fct_t)( ompi_convertor_t* pConvertor,
|
||||
int32_t* freeAfter );
|
||||
typedef void*(*memalloc_fct_t)( size_t* pLength );
|
||||
|
||||
/**
 * One entry of the stack a convertor uses during conversion. It is the
 * element type of both ompi_convertor_t::pStack and the embedded
 * ompi_convertor_t::static_stack array.
 */
typedef struct __dt_stack {
|
||||
int32_t index; /**< index in the element description */
|
||||
int32_t count; /**< number of repetitions still to be performed */
|
||||
int32_t end_loop; /**< for loops, the end of the loop; unused otherwise */
|
||||
long disp; /**< actual displacement depending on the count field */
|
||||
} dt_stack_t;
|
||||
#define DT_STATIC_STACK_SIZE 5
|
||||
|
||||
struct ompi_convertor_t {
|
||||
ompi_object_t super; /**< basic superclass */
|
||||
dt_desc_t* pDesc; /**< the datatype description associated with the convertor */
|
||||
uint32_t remoteArch; /**< the remote architecture */
|
||||
dt_stack_t* pStack; /**< the local stack for the actual conversion */
|
||||
uint32_t converted; /**< the number of already converted elements */
|
||||
uint32_t bConverted; /**< the size of already converted elements in bytes */
|
||||
uint32_t flags; /**< the properties of this convertor */
|
||||
uint32_t count; /**< the total number of full datatype elements */
|
||||
uint32_t stack_pos; /**< the actual position on the stack */
|
||||
char* pBaseBuf; /**< initial buffer as supplied by the user */
|
||||
uint32_t available_space; /**< total available space */
|
||||
convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */
|
||||
memalloc_fct_t memAlloc_fn; /**< pointer to the memory allocation function */
|
||||
conversion_fct_t* pFunctions; /**< the convertor functions pointer */
|
||||
ompi_object_t super; /**< basic superclass */
|
||||
dt_desc_t* pDesc; /**< the datatype description associated with the convertor */
|
||||
uint32_t remoteArch; /**< the remote architecture */
|
||||
uint32_t converted; /**< the number of already converted elements */
|
||||
uint32_t bConverted; /**< the size of already converted elements in bytes */
|
||||
uint32_t flags; /**< the properties of this convertor */
|
||||
uint32_t count; /**< the total number of full datatype elements */
|
||||
dt_stack_t* pStack; /**< the local stack for the actual conversion */
|
||||
uint32_t stack_pos; /**< the actual position on the stack */
|
||||
uint32_t stack_size; /**< size of the allocated stack */
|
||||
char* pBaseBuf; /**< initial buffer as supplied by the user */
|
||||
uint32_t available_space; /**< total available space */
|
||||
convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */
|
||||
memalloc_fct_t memAlloc_fn; /**< pointer to the memory allocation function */
|
||||
conversion_fct_t* pFunctions; /**< the convertor functions pointer */
|
||||
dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack to be used for contiguous data */
|
||||
};
|
||||
OBJ_CLASS_DECLARATION( ompi_convertor_t );
|
||||
|
||||
@ -260,8 +267,6 @@ static inline int32_t ompi_convertor_unpack( ompi_convertor_t* pConv,
|
||||
|
||||
/* and finally the convertor functions */
|
||||
OMPI_DECLSPEC ompi_convertor_t* ompi_convertor_create( int32_t remote_arch, int32_t mode );
|
||||
OMPI_DECLSPEC ompi_convertor_t* ompi_convertor_get_copy( const ompi_convertor_t* pConvertor );
|
||||
OMPI_DECLSPEC int32_t ompi_convertor_copy( const ompi_convertor_t* pSrcConv, ompi_convertor_t* pDestConv );
|
||||
OMPI_DECLSPEC int32_t ompi_convertor_init_for_send( ompi_convertor_t* pConv, uint32_t flags,
|
||||
const dt_desc_t* pData, int32_t count,
|
||||
const void* pUserBuf, int32_t local_starting_point,
|
||||
@ -274,6 +279,30 @@ OMPI_DECLSPEC int32_t ompi_convertor_need_buffers( ompi_convertor_t* pConvertor
|
||||
OMPI_DECLSPEC int32_t ompi_convertor_get_packed_size( const ompi_convertor_t* pConv, uint32_t* pSize );
|
||||
OMPI_DECLSPEC int32_t ompi_convertor_get_unpacked_size( const ompi_convertor_t* pConv, uint32_t* pSize );
|
||||
|
||||
/**
 * Reset pDestConv to a clean state that inherits the remote architecture and
 * the conversion function table of pSrcConv.
 *
 * Only remoteArch and pFunctions are taken from the source. The destination's
 * datatype is cleared, its stack is pointed back at the embedded static_stack
 * (freeing a larger heap stack first), and stack_pos / available_space are
 * zeroed. Fields not assigned here (e.g. flags, count, fAdvance, memAlloc_fn)
 * are left untouched.
 *
 * @param pSrcConv   convertor to take remoteArch and pFunctions from
 * @param pDestConv  convertor to reset; its stack_size must already be valid
 *                   because it decides whether pStack is freed
 * @return always OMPI_SUCCESS
 *
 * NOTE(review): pDesc is overwritten with NULL without a release. If the
 * destination already holds a retained datatype, that reference appears to be
 * dropped -- confirm callers only pass freshly constructed convertors.
 */
static inline int ompi_convertor_copy( const ompi_convertor_t* pSrcConv, ompi_convertor_t* pDestConv )
|
||||
{
|
||||
pDestConv->pDesc = NULL;
|
||||
pDestConv->remoteArch = pSrcConv->remoteArch;
|
||||
/* Release the old stack only when it is larger than the embedded one;
 * a stack_size of at most DT_STATIC_STACK_SIZE means pStack is static_stack
 * (or unset) and must not be passed to free(). */
|
||||
if( pDestConv->stack_size > DT_STATIC_STACK_SIZE ) {
|
||||
free( pDestConv->pStack );
|
||||
}
|
||||
pDestConv->pStack = pDestConv->static_stack;
|
||||
pDestConv->stack_size = DT_STATIC_STACK_SIZE;
|
||||
pDestConv->stack_pos = 0;
|
||||
pDestConv->available_space = 0;
|
||||
pDestConv->pFunctions = pSrcConv->pFunctions;
|
||||

||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Create a new convertor object and initialise it from pConvertor via
 * ompi_convertor_copy().
 *
 * @param pConvertor  convertor whose settings are copied
 * @return the newly created convertor
 *
 * NOTE(review): the result of OBJ_NEW is used without a NULL check, and the
 * status returned by ompi_convertor_copy() is deliberately discarded.
 */
static inline ompi_convertor_t* ompi_convertor_get_copy( const ompi_convertor_t* pConvertor )
|
||||
{
|
||||
ompi_convertor_t* pDestConv = OBJ_NEW(ompi_convertor_t);
|
||||
(void)ompi_convertor_copy( pConvertor, pDestConv );
|
||||
return pDestConv;
|
||||
}
|
||||
|
||||
/* temporary function prototypes. They should move in other place later. */
|
||||
OMPI_DECLSPEC int32_t ompi_ddt_get_args( const dt_desc_t* pData, int32_t which,
|
||||
int32_t * ci, int32_t * i,
|
||||
|
@ -102,17 +102,11 @@ static inline void DUMP( char* fmt, ... )
|
||||
#error DT_MAX_PREDEFINED should be updated
|
||||
#endif /* safe check for max predefined datatypes. */
|
||||
|
||||
#define DT_INCREASE_STACK 32
|
||||
#define DT_INCREASE_STACK 32
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
/**
 * One entry of the stack a convertor uses during conversion
 * (referenced elsewhere through the dt_stack_t typedef).
 */
struct __dt_stack {
|
||||
int32_t index; /**< index in the element description */
|
||||
int32_t count; /**< number of repetitions still to be performed */
|
||||
int32_t end_loop; /**< for loops, the end of the loop; unused otherwise */
|
||||
long disp; /**< actual displacement depending on the count field */
|
||||
};
|
||||
|
||||
/* These 2 typedefs are the same as the dt_elem_desc_t except
|
||||
* for the name of the fields.
|
||||
|
@ -760,12 +760,20 @@ int ompi_convertor_init_for_send( ompi_convertor_t* pConv,
|
||||
OBJ_RETAIN( datatype );
|
||||
if( pConv->pDesc != datatype ) {
|
||||
pConv->pDesc = (dt_desc_t*)datatype;
|
||||
if( pConv->pStack != NULL ) free( pConv->pStack );
|
||||
if( pConv->pStack != NULL ) {
|
||||
if( pConv->stack_size > DT_STATIC_STACK_SIZE )
|
||||
free( pConv->pStack );
|
||||
}
|
||||
pConv->pStack = NULL;
|
||||
}
|
||||
|
||||
if( pConv->pStack == NULL ) {
|
||||
pConv->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * (datatype->btypes[DT_LOOP] + 3) );
|
||||
pConv->stack_size = datatype->btypes[DT_LOOP] + 3;
|
||||
if( pConv->stack_size > DT_STATIC_STACK_SIZE ) {
|
||||
pConv->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * pConv->stack_size );
|
||||
} else {
|
||||
pConv->pStack = pConv->static_stack;
|
||||
}
|
||||
pConv->stack_pos = 0; /* just to be sure */
|
||||
}
|
||||
|
||||
@ -799,51 +807,31 @@ ompi_convertor_t* ompi_convertor_create( int remote_arch, int mode )
|
||||
{
|
||||
ompi_convertor_t* pConv = OBJ_NEW(ompi_convertor_t);
|
||||
|
||||
pConv->pDesc = NULL;
|
||||
pConv->pStack = NULL;
|
||||
pConv->remoteArch = remote_arch;
|
||||
pConv->fAdvance = NULL;
|
||||
pConv->memAlloc_fn = NULL;
|
||||
return pConv;
|
||||
}
|
||||
|
||||
static void ompi_convertor_construct( ompi_convertor_t* pConv )
|
||||
{
|
||||
pConv->pDesc = NULL;
|
||||
pConv->pStack = NULL;
|
||||
pConv->fAdvance = NULL;
|
||||
pConv->pDesc = NULL;
|
||||
pConv->pStack = pConv->static_stack;
|
||||
pConv->stack_size = DT_STATIC_STACK_SIZE;
|
||||
pConv->fAdvance = NULL;
|
||||
pConv->memAlloc_fn = NULL;
|
||||
}
|
||||
|
||||
static void ompi_convertor_destruct( ompi_convertor_t* pConv )
|
||||
{
|
||||
if( pConv->pStack != NULL ) free( pConv->pStack );
|
||||
pConv->pStack = NULL;
|
||||
if( pConv->pDesc != NULL ) OBJ_RELEASE( pConv->pDesc );
|
||||
pConv->pDesc = NULL;
|
||||
if( pConv->stack_size > DT_STATIC_STACK_SIZE ) {
|
||||
free( pConv->pStack );
|
||||
}
|
||||
|
||||
if( pConv->pDesc != NULL ) OBJ_RELEASE( pConv->pDesc );
|
||||
pConv->pDesc = NULL;
|
||||
}
|
||||
|
||||
OBJ_CLASS_INSTANCE(ompi_convertor_t, ompi_object_t, ompi_convertor_construct, ompi_convertor_destruct );
|
||||
|
||||
/**
 * Reset pDestConv to a clean state that inherits the remote architecture and
 * the conversion function table of pSrcConv.
 *
 * Only remoteArch and pFunctions are taken from the source. The destination's
 * datatype and stack pointers are set to NULL, and stack_pos /
 * available_space are zeroed. Other fields are left untouched.
 *
 * @param pSrcConv   convertor to take remoteArch and pFunctions from
 * @param pDestConv  convertor to reset
 * @return always OMPI_SUCCESS
 *
 * NOTE(review): pDesc and pStack are overwritten with NULL without releasing
 * or freeing whatever they previously referenced -- confirm callers only pass
 * freshly constructed convertors.
 */
inline int ompi_convertor_copy( const ompi_convertor_t* pSrcConv, ompi_convertor_t* pDestConv )
|
||||
{
|
||||
pDestConv->pDesc = NULL;
|
||||
pDestConv->remoteArch = pSrcConv->remoteArch;
|
||||
pDestConv->pStack = NULL;
|
||||
pDestConv->stack_pos = 0;
|
||||
pDestConv->available_space = 0;
|
||||
pDestConv->pFunctions = pSrcConv->pFunctions;
|
||||

||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Create a new convertor object and initialise it from pConvertor via
 * ompi_convertor_copy().
 *
 * @param pConvertor  convertor whose settings are copied
 * @return the newly created convertor
 *
 * NOTE(review): the result of OBJ_NEW is used without a NULL check, and the
 * status returned by ompi_convertor_copy() is deliberately discarded.
 */
ompi_convertor_t* ompi_convertor_get_copy( const ompi_convertor_t* pConvertor )
|
||||
{
|
||||
ompi_convertor_t* pDestConv = OBJ_NEW(ompi_convertor_t);
|
||||
(void)ompi_convertor_copy( pConvertor, pDestConv );
|
||||
return pDestConv;
|
||||
}
|
||||
|
||||
/* Actually we suppose that we can only do receiver side conversion */
|
||||
int ompi_convertor_get_packed_size( const ompi_convertor_t* pConv, uint32_t* pSize )
|
||||
{
|
||||
|
@ -613,12 +613,20 @@ int ompi_convertor_init_for_recv( ompi_convertor_t* pConv, uint32_t flags,
|
||||
if( pConv->pDesc != datatype ) {
|
||||
pConv->pDesc = (dt_desc_t*)datatype;
|
||||
pConv->flags = CONVERTOR_RECV;
|
||||
if( pConv->pStack != NULL ) free( pConv->pStack );
|
||||
if( pConv->pStack != NULL ) {
|
||||
if( pConv->stack_size > DT_STATIC_STACK_SIZE )
|
||||
free( pConv->pStack );
|
||||
}
|
||||
pConv->pStack = NULL;
|
||||
}
|
||||
if( pConv->pStack == NULL ) {
|
||||
pConv->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * (datatype->btypes[DT_LOOP] + 3) );
|
||||
pConv->stack_pos = 0;
|
||||
pConv->stack_size = datatype->btypes[DT_LOOP] + 3;
|
||||
if( pConv->stack_size > DT_STATIC_STACK_SIZE ) {
|
||||
pConv->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * pConv->stack_size );
|
||||
} else {
|
||||
pConv->pStack = pConv->static_stack;
|
||||
}
|
||||
pConv->stack_pos = 0; /* just to be sure */
|
||||
}
|
||||
|
||||
pConv->flags = CONVERTOR_RECV | CONVERTOR_HOMOGENEOUS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user