1
1

Complete the OUT protocol. Small latency improvements. Some minor cleanups.

Create some macros, reorder some functions. Make sure all fragments are
correctly released at the end.

This commit was SVN r12926.
This commit is contained in:
George Bosilca 2006-12-26 18:15:24 +00:00
parent 75a35ed7ee
commit ff2319dcb7
4 changed files with 166 additions and 162 deletions

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -17,7 +17,6 @@
*/
#include "ompi_config.h"
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/if.h"
#include "ompi/mca/pml/pml.h"
@ -50,10 +49,10 @@ int mca_btl_mx_add_procs( struct mca_btl_base_module_t* btl,
mca_btl_base_endpoint_t* mx_endpoint;
/**
* By default don't allow communications with itself nor
* with any other processes on the same node.
* BTL self and sm are supposed to take care of such communications.
*/
* By default don't allow communications with self nor with any
* other processes on the same node. The BTL self and sm are
* supposed to take care of such communications.
*/
if( ompi_procs[i]->proc_flags & OMPI_PROC_FLAG_LOCAL ) {
if( ompi_procs[i] == ompi_proc_local_proc ) {
if( 0 == mca_btl_mx_component.mx_support_self )
@ -139,8 +138,8 @@ int mca_btl_mx_register( struct mca_btl_base_module_t* btl,
frag->mx_frag_list = NULL;
frag->tag = tag;
mx_segment.segment_ptr = frag->base.des_dst->seg_addr.pval;
mx_segment.segment_length = frag->base.des_dst->seg_len;
mx_segment.segment_ptr = (void*)(frag+1);
mx_segment.segment_length = mx_btl->super.btl_eager_limit;
mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1, (uint64_t)tag,
BTL_MX_RECV_MASK,
frag, &(frag->mx_request) );
@ -183,7 +182,8 @@ mca_btl_base_descriptor_t* mca_btl_mx_alloc( struct mca_btl_base_module_t* btl,
MCA_BTL_MX_FRAG_ALLOC_EAGER(mx_btl, frag, rc);
frag->segment[0].seg_len =
size <= mx_btl->super.btl_eager_limit ?
size : mx_btl->super.btl_eager_limit ;
size : mx_btl->super.btl_eager_limit ;
frag->segment[0].seg_addr.pval = (void*)(frag+1);
frag->base.des_src = frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
@ -202,11 +202,9 @@ int mca_btl_mx_free( struct mca_btl_base_module_t* btl,
{
mca_btl_mx_frag_t* frag = (mca_btl_mx_frag_t*)des;
if( 0 == frag->base.des_dst_cnt ) { /* send fragment */
MCA_BTL_MX_FRAG_RETURN(btl, frag);
} else { /* receive fragment */
opal_output( 0, "BARFFFFFFF return send frag\n" );
}
assert( 0xff == frag->tag );
MCA_BTL_MX_FRAG_RETURN(btl, frag);
return OMPI_SUCCESS;
}
@ -217,65 +215,68 @@ int mca_btl_mx_free( struct mca_btl_base_module_t* btl,
* @param btl (IN) BTL module
* @param peer (IN) BTL peer addressing
*/
mca_btl_base_descriptor_t* mca_btl_mx_prepare_src( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
size_t reserve,
size_t* size )
mca_btl_base_descriptor_t*
mca_btl_mx_prepare_src( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
struct mca_mpool_base_registration_t* registration,
struct ompi_convertor_t* convertor,
size_t reserve,
size_t* size )
{
mca_btl_mx_frag_t* frag;
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data = *size;
size_t max_data;
int rc;
max_data = btl->btl_eager_limit - reserve;
if( (*size) < max_data ) {
max_data = *size;
}
/* If the data is contiguous we can use directly the pointer
* to the user memory.
*/
if( 0 == ompi_convertor_need_buffers(convertor) ) {
MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc);
if( NULL == frag ) {
return NULL;
if( 0 == reserve ) {
MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc);
if( NULL == frag ) {
return NULL;
}
max_data = *size;
frag->base.des_src_cnt = 1;
} else {
MCA_BTL_MX_FRAG_ALLOC_EAGER( mx_btl, frag, rc );
if( NULL == frag ) {
return NULL;
}
frag->base.des_src_cnt = 2;
}
if( (max_data + reserve) > btl->btl_eager_limit ) {
max_data = btl->btl_eager_limit - reserve;
}
/* let the convertor figure out the correct pointer depending on the data layout */
/**
* let the convertor figure out the correct pointer depending
* on the data layout
*/
iov.iov_base = NULL;
iov.iov_len = max_data;
frag->base.des_src_cnt = 2;
frag->segment[0].seg_len = reserve;
} else {
MCA_BTL_MX_FRAG_ALLOC_EAGER( mx_btl, frag, rc );
if( NULL == frag ) {
return NULL;
}
if( (max_data + reserve) <= btl->btl_eager_limit ) {
iov.iov_len = max_data;
} else {
iov.iov_len = mca_btl_mx_module.super.btl_eager_limit - reserve;
max_data = iov.iov_len; /* let the PML establish the pipeline */
}
iov.iov_base = (void*)((unsigned char*)frag->segment[0].seg_addr.pval + reserve);
frag->segment[0].seg_len = reserve;
frag->base.des_src_cnt = 1;
iov.iov_base = (void*)((unsigned char*)frag->segment[0].seg_addr.pval + reserve);
}
frag->segment[0].seg_len = reserve;
iov.iov_len = max_data;
(void)ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );
*size = max_data;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );
*size = max_data;
if( rc < 0 ) {
MCA_BTL_MX_FRAG_RETURN( mx_btl, frag );
return NULL;
}
if( 1 == frag->base.des_src_cnt ) {
frag->segment[0].seg_len += max_data;
if( 0 == reserve )
frag->segment[0].seg_addr.pval = iov.iov_base;
} else {
frag->segment[1].seg_addr.pval = iov.iov_base;
frag->segment[1].seg_len = max_data;
frag->segment[1].seg_addr.pval = iov.iov_base;
}
frag->base.des_src = frag->segment;
frag->base.des_dst = NULL;
@ -284,45 +285,6 @@ mca_btl_base_descriptor_t* mca_btl_mx_prepare_src( struct mca_btl_base_module_t*
return &frag->base;
}
/**
 * Initiate an asynchronous put.
 *
 * @param btl        (IN) BTL module
 * @param endpoint   (IN) BTL addressing information
 * @param descriptor (IN) Description of the data to be transferred
 *
 * NOTE(review): this version always sends a single segment,
 * des_src[1], and hard-codes the segment count to 1 (see the inline
 * comment on the mx_isend call); des_src[0] is never transmitted.
 * The MX match value is taken from the destination segment key
 * (des_dst[0].seg_key.key64), presumably matched by a receive posted
 * on the remote side — TODO confirm against the receiver.
 */
static int mca_btl_mx_put( struct mca_btl_base_module_t* btl,
                           struct mca_btl_base_endpoint_t* endpoint,
                           struct mca_btl_base_descriptor_t* descriptor )
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*)btl;
    mca_btl_mx_frag_t* frag = (mca_btl_mx_frag_t*)descriptor;
    mx_segment_t mx_segment;
    mx_return_t mx_return;

    /* Lazily establish the MX connection on first use; an unreachable
     * peer or a failed connect aborts the put. */
    if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status ) {
        if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERROR;
        if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
            return OMPI_ERROR;
    }
    frag->endpoint = endpoint;
    /* 0xff tags the fragment as a send, so completion goes through
     * the descriptor callback (see MCA_BTL_MX_PROGRESS). */
    frag->tag = 0xff;
    mx_segment.segment_ptr = descriptor->des_src[1].seg_addr.pval;
    mx_segment.segment_length = descriptor->des_src[1].seg_len;
    mx_return = mx_isend( mx_btl->mx_endpoint, &mx_segment, 1/*descriptor->des_src_cnt*/,
                          endpoint->mx_peer_addr,
                          descriptor->des_dst[0].seg_key.key64, frag, &frag->mx_request );
    if( MX_SUCCESS != mx_return ) {
        opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) );
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/**
* Prepare a descriptor for send/rdma using the supplied
* convertor. If the convertor references data that is contigous,
@ -379,6 +341,50 @@ mca_btl_base_descriptor_t* mca_btl_mx_prepare_dst( struct mca_btl_base_module_t*
}
/**
 * Initiate an asynchronous put.
 *
 * @param btl        (IN) BTL module
 * @param endpoint   (IN) BTL addressing information
 * @param descriptor (IN) Description of the data to be transferred
 *
 * @return OMPI_SUCCESS when the isend was posted, OMPI_ERROR on an
 *         unreachable/unconnectable peer or an MX send failure.
 *
 * Builds up to two MX segments from des_src (header + payload when
 * des_src_cnt > 1) and posts them with mx_isend. The MX match value
 * is the destination segment key (des_dst[0].seg_key.key64),
 * presumably matched by a receive posted on the remote side — TODO
 * confirm against the receiver.
 */
static int mca_btl_mx_put( struct mca_btl_base_module_t* btl,
                           struct mca_btl_base_endpoint_t* endpoint,
                           struct mca_btl_base_descriptor_t* descriptor )
{
    mca_btl_mx_module_t* mx_btl = (mca_btl_mx_module_t*)btl;
    mca_btl_mx_frag_t* frag = (mca_btl_mx_frag_t*)descriptor;
    mx_segment_t mx_segment[2];
    mx_return_t mx_return;

    /* Lazily establish the MX connection on first use; an unreachable
     * peer or a failed connect aborts the put. */
    if( MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status ) {
        if( MCA_BTL_MX_NOT_REACHEABLE == ((mca_btl_mx_endpoint_t*)endpoint)->status )
            return OMPI_ERROR;
        if( OMPI_SUCCESS != mca_btl_mx_proc_connect( (mca_btl_mx_endpoint_t*)endpoint ) )
            return OMPI_ERROR;
    }
    frag->endpoint = endpoint;
    /* 0xff tags the fragment as a send, so completion goes through
     * the descriptor callback (see MCA_BTL_MX_PROGRESS). */
    frag->tag = 0xff;
    mx_segment[0].segment_ptr = descriptor->des_src[0].seg_addr.pval;
    mx_segment[0].segment_length = descriptor->des_src[0].seg_len;
    if( 1 < descriptor->des_src_cnt ) {
        mx_segment[1].segment_ptr = descriptor->des_src[1].seg_addr.pval;
        mx_segment[1].segment_length = descriptor->des_src[1].seg_len;
    }
    mx_return = mx_isend( mx_btl->mx_endpoint, mx_segment, descriptor->des_src_cnt,
                          endpoint->mx_peer_addr,
                          descriptor->des_dst[0].seg_key.key64, frag, &frag->mx_request );
    if( MX_SUCCESS != mx_return ) {
        opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) );
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/**
* Initiate an asynchronous send.
*
@ -407,12 +413,13 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
return OMPI_ERROR;
}
frag->endpoint = endpoint;
frag->tag = tag;
frag->endpoint = endpoint;
frag->tag = 0xff;
mx_segment[0].segment_ptr = descriptor->des_src[0].seg_addr.pval;
mx_segment[0].segment_length = descriptor->des_src[0].seg_len;
total_length = mx_segment[0].segment_length;
if( 2 == descriptor->des_src_cnt ) {
if( 1 < descriptor->des_src_cnt ) {
mx_segment[1].segment_ptr = descriptor->des_src[1].seg_addr.pval;
mx_segment[1].segment_length = descriptor->des_src[1].seg_len;
total_length += mx_segment[1].segment_length;

View file

@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -46,7 +46,7 @@ extern "C" {
/**
* The mask used for receive and for the PUT protocol
*/
#define BTL_MX_RECV_MASK 0xffffffffffffffffULL
#define BTL_MX_RECV_MASK 0x0000ffffffffffffULL
#define BTL_MX_PUT_MASK 0xffffffffffffffffULL
/**
@ -241,9 +241,8 @@ extern int mca_btl_mx_register(
* @param size (IN) Request segment size.
*/
extern mca_btl_base_descriptor_t* mca_btl_mx_alloc(
struct mca_btl_base_module_t* btl,
size_t size);
mca_btl_base_descriptor_t* mca_btl_mx_alloc( struct mca_btl_base_module_t* btl,
size_t size );
/**
@ -253,9 +252,8 @@ extern mca_btl_base_descriptor_t* mca_btl_mx_alloc(
* @param descriptor (IN) Allocated descriptor.
*/
extern int mca_btl_mx_free(
struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des);
int mca_btl_mx_free( struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des );
/**
@ -270,25 +268,57 @@ extern int mca_btl_mx_free(
* @param convertor (IN) Data type convertor
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
*/
*/
mca_btl_base_descriptor_t*
mca_btl_mx_prepare_src( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct ompi_convertor_t* convertor,
size_t reserve,
size_t* size );
mca_btl_base_descriptor_t* mca_btl_mx_prepare_src(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct ompi_convertor_t* convertor,
size_t reserve,
size_t* size
);
extern mca_btl_base_descriptor_t* mca_btl_mx_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct ompi_convertor_t* convertor,
size_t reserve,
size_t* size);
mca_btl_base_descriptor_t*
mca_btl_mx_prepare_dst( struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* peer,
struct mca_mpool_base_registration_t*,
struct ompi_convertor_t* convertor,
size_t reserve,
size_t* size );
/**
 * Process one completed MX request for the given BTL module.
 *
 * The fragment is recovered from mx_status.context. A tag of 0xff
 * marks a send: the descriptor completion callback is invoked. Any
 * other tag is a receive: the registered callback for that tag is
 * invoked with the actual message length, then the fragment is
 * re-posted to MX as an eager receive so it can match the next
 * incoming message.
 *
 * NOTE(review): function-like macro — mx_status is expanded several
 * times, so the argument must be a side-effect-free lvalue. The
 * __-prefixed locals are technically reserved identifiers in C.
 */
#define MCA_BTL_MX_PROGRESS(mx_btl, mx_status)                          \
do {                                                                    \
    mca_btl_mx_frag_t* __frag = mx_status.context;                      \
    mx_segment_t __mx_segment;                                          \
    mx_return_t __mx_return;                                            \
                                                                        \
    if( NULL != __frag ) {                                              \
        if( 0xff == __frag->tag ) { /* it's a send */                   \
            /* call the completion callback */                          \
            __frag->base.des_cbfunc( &(mx_btl->super), __frag->endpoint, \
                                     &(__frag->base), OMPI_SUCCESS );   \
        } else { /* and this one is a receive */                        \
            mca_btl_base_recv_reg_t* __reg;                             \
                                                                        \
            __reg = &(mx_btl->mx_reg[__frag->tag]);                     \
            __frag->base.des_dst->seg_len = mx_status.msg_length;       \
            __reg->cbfunc( &(mx_btl->super), __frag->tag, &(__frag->base), \
                           __reg->cbdata );                             \
            /**                                                         \
             * The upper level extract the data from the fragment.      \
             * Now we can register the fragment                         \
             * again with the MX BTL.                                   \
             */                                                         \
            __mx_segment.segment_ptr = __frag->base.des_dst->seg_addr.pval; \
            __mx_segment.segment_length = mca_btl_mx_module.super.btl_eager_limit; \
            __mx_return = mx_irecv( mx_btl->mx_endpoint, &__mx_segment, 1, \
                                    (uint64_t)__frag->tag, BTL_MX_RECV_MASK, \
                                    __frag, &(__frag->mx_request) );    \
            if( MX_SUCCESS != __mx_return ) {                           \
                opal_output( 0, "Fail to re-register a fragment with the MX NIC ...\n" ); \
            }                                                           \
        }                                                               \
    }                                                                   \
} while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}

View file

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -318,17 +318,19 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
NULL, 0);
return NULL;
}
#if 0
/* check for limit on number of btls */
if(mca_btl_mx_component.mx_num_btls > mca_btl_mx_component.mx_max_btls)
mca_btl_mx_component.mx_num_btls = mca_btl_mx_component.mx_max_btls;
#endif
/* Now we know how many NIC are available on the system. We will create a BTL for each one
* and then give a pointer to the BTL to the upper level.
*/
mca_btl_mx_component.mx_btls = malloc( mca_btl_mx_component.mx_num_btls * sizeof(mca_btl_base_module_t*) );
if( NULL == mca_btl_mx_component.mx_btls )
if( NULL == mca_btl_mx_component.mx_btls ) {
opal_output( 0, "MX BTL no memory\n" );
return NULL;
}
/* determine the NIC ids */
size = sizeof(uint64_t) * (mca_btl_mx_component.mx_num_btls + 1);
@ -396,9 +398,7 @@ int mca_btl_mx_component_progress()
int32_t num_progressed = 0, i;
mx_status_t mx_status;
mx_return_t mx_return;
mx_segment_t mx_segment;
mx_request_t mx_request;
mca_btl_mx_frag_t* frag;
for( i = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) {
mca_btl_mx_module_t* mx_btl = mca_btl_mx_component.mx_btls[i];
@ -429,37 +429,12 @@ int mca_btl_mx_component_progress()
mx_return);
continue;
}
frag = mx_status.context;
if( 0 == frag->base.des_dst_cnt ) { /* it's a send */
/* call the completion callback */
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS);
} else { /* and this one is a receive */
mca_btl_base_recv_reg_t* reg;
reg = &(mx_btl->mx_reg[frag->tag]);
frag->base.des_dst->seg_len = mx_status.msg_length;
reg->cbfunc( &(mx_btl->super), frag->tag, &(frag->base), reg->cbdata );
/*
* The upper level extract the data from the fragment. Now we can register the fragment
* again with the MX BTL.
*/
mx_segment.segment_ptr = frag->base.des_dst->seg_addr.pval;
mx_segment.segment_length = mca_btl_mx_module.super.btl_eager_limit;
mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1, (uint64_t)frag->tag,
BTL_MX_RECV_MASK,
frag, &(frag->mx_request) );
if( MX_SUCCESS != mx_return ) {
opal_output( 0, "Fail to re-register a fragment with the MX NIC ...\n" );
}
}
/*MCA_BTL_MX_PROGRESS(mx_btl, mx_status);*/
/*
* on the mx_status we have now the pointer attached to the request. This pointer indicate
* which fragment we are working on. On the status we have the status of the operation, so
* we know what we are supposed to do next.
/* on the mx_status we have now the pointer attached to the request.
* This pointer indicate which fragment we are working on. On the
* status we have the status of the operation, so we know what we
* are supposed to do next.
*/
MCA_BTL_MX_PROGRESS(mx_btl, mx_status);
num_progressed++;
}
return num_progressed;

View file

@ -68,25 +68,17 @@ extern "C" {
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT( &mca_btl_mx_component.mx_send_eager_frags, item, rc); \
frag = (mca_btl_mx_frag_t*) item; \
frag->mx_frag_list = (ompi_free_list_t*)&(mca_btl_mx_component.mx_send_eager_frags); \
frag->mx_frag_list = &(mca_btl_mx_component.mx_send_eager_frags); \
frag->segment[0].seg_addr.pval = (void*)(frag+1); \
frag->segment[0].seg_len = mca_btl_mx_module.super.btl_eager_limit; \
}
#if 0
#define MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc) \
{ \
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT( &mca_btl_mx_component.mx_send_user_frags, item, rc); \
frag = (mca_btl_mx_frag_t*) item; \
frag->mx_frag_list = (ompi_free_list_t*)&(mca_btl_mx_component.mx_send_user_frags); \
frag->segment[0].seg_addr.pval = (void*)(frag+1); \
frag->segment[0].seg_len = mca_btl_mx_module.super.btl_eager_limit; \
frag->mx_frag_list = &(mca_btl_mx_component.mx_send_user_frags); \
}
#else
#define MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc) \
MCA_BTL_MX_FRAG_ALLOC_EAGER( btl, frag, rc )
#endif
#define MCA_BTL_MX_FRAG_RETURN(btl, frag) \
{ \