1
1

Improve the performance of the MX BTL. Correct the fake PUT protocol.

This commit was SVN r17452.
Этот коммит содержится в:
George Bosilca 2008-02-14 04:38:55 +00:00
родитель e7aaf6aa67
Коммит 255cd2186b
5 изменённых файлов: 105 добавлений и 90 удалений

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University * Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -136,7 +136,8 @@ int mca_btl_mx_register( struct mca_btl_base_module_t* btl,
mx_segment.segment_ptr = (void*)(frag+1); mx_segment.segment_ptr = (void*)(frag+1);
mx_segment.segment_length = mx_btl->super.btl_eager_limit; mx_segment.segment_length = mx_btl->super.btl_eager_limit;
mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1, 0x0ULL, 0x0ULL, mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1,
0x01ULL, BTL_MX_RECV_MASK,
frag, &(frag->mx_request) ); frag, &(frag->mx_request) );
if( MX_SUCCESS != mx_return ) { if( MX_SUCCESS != mx_return ) {
opal_output( 0, "mca_btl_mx_register: mx_irecv failed with status %d (%s)\n", opal_output( 0, "mca_btl_mx_register: mx_irecv failed with status %d (%s)\n",
@ -317,7 +318,8 @@ mca_btl_base_descriptor_t* mca_btl_mx_prepare_dst( struct mca_btl_base_module_t*
mx_segment.segment_ptr = frag->segment[0].seg_addr.pval; mx_segment.segment_ptr = frag->segment[0].seg_addr.pval;
mx_segment.segment_length = frag->segment[0].seg_len; mx_segment.segment_length = frag->segment[0].seg_len;
mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1, frag->segment[0].seg_key.key64, mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1,
frag->segment[0].seg_key.key64,
BTL_MX_PUT_MASK, NULL, &(frag->mx_request) ); BTL_MX_PUT_MASK, NULL, &(frag->mx_request) );
if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) { if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
opal_output( 0, "Fail to re-register a fragment with the MX NIC ...\n" ); opal_output( 0, "Fail to re-register a fragment with the MX NIC ...\n" );
@ -325,6 +327,17 @@ mca_btl_base_descriptor_t* mca_btl_mx_prepare_dst( struct mca_btl_base_module_t*
return NULL; return NULL;
} }
#ifdef HAVE_MX_FORGET
{
mx_return = mx_forget( mx_btl->mx_endpoint, &(frag->mx_request) );
if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
opal_output( 0, "mx_forget failed in mca_btl_mx_prepare_dst with error %d (%s)\n",
mx_return, mx_strerror(mx_return) );
return NULL;
}
}
#endif
/* Allow the fragment to be recycled using the mca_btl_mx_free function */ /* Allow the fragment to be recycled using the mca_btl_mx_free function */
frag->type = MCA_BTL_MX_SEND; frag->type = MCA_BTL_MX_SEND;
@ -372,7 +385,8 @@ static int mca_btl_mx_put( struct mca_btl_base_module_t* btl,
mx_return = mx_isend( mx_btl->mx_endpoint, mx_segment, descriptor->des_src_cnt, mx_return = mx_isend( mx_btl->mx_endpoint, mx_segment, descriptor->des_src_cnt,
endpoint->mx_peer_addr, endpoint->mx_peer_addr,
descriptor->des_dst[0].seg_key.key64, frag, &frag->mx_request ); descriptor->des_dst[0].seg_key.key64, frag,
&frag->mx_request );
if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) { if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) ); opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) );
return OMPI_ERROR; return OMPI_ERROR;
@ -400,7 +414,7 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
mca_btl_mx_frag_t* frag = (mca_btl_mx_frag_t*)descriptor; mca_btl_mx_frag_t* frag = (mca_btl_mx_frag_t*)descriptor;
mx_segment_t mx_segment[2]; mx_segment_t mx_segment[2];
mx_return_t mx_return; mx_return_t mx_return;
uint64_t total_length = 0; uint64_t total_length = 0, tag64;
uint32_t i = 0; uint32_t i = 0;
if( OPAL_UNLIKELY(MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status) ) { if( OPAL_UNLIKELY(MCA_BTL_MX_CONNECTED != ((mca_btl_mx_endpoint_t*)endpoint)->status) ) {
@ -421,8 +435,10 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl,
total_length += descriptor->des_src[i].seg_len; total_length += descriptor->des_src[i].seg_len;
} while (++i < descriptor->des_src_cnt); } while (++i < descriptor->des_src_cnt);
mx_return = mx_isend( mx_btl->mx_endpoint, mx_segment, descriptor->des_src_cnt, endpoint->mx_peer_addr, tag64 = 0x01ULL | (((uint64_t)tag) << 8);
(uint64_t)tag, frag, &frag->mx_request ); mx_return = mx_isend( mx_btl->mx_endpoint, mx_segment, descriptor->des_src_cnt,
endpoint->mx_peer_addr,
tag64, frag, &frag->mx_request );
if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) { if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) {
opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) ); opal_output( 0, "mx_isend fails with error %s\n", mx_strerror(mx_return) );
return OMPI_ERROR; return OMPI_ERROR;

Просмотреть файл

@ -42,14 +42,12 @@
#include <mx_extensions.h> #include <mx_extensions.h>
#endif /* HAVE_MX_EXTENSIONS_H */ #endif /* HAVE_MX_EXTENSIONS_H */
#if defined(c_plusplus) || defined(__cplusplus) BEGIN_C_DECLS
extern "C" {
#endif
/** /**
* The mask used for receive and for the PUT protocol * The mask used for receive and for the PUT protocol
*/ */
#define BTL_MX_RECV_MASK 0x0000ffffffffffffULL #define BTL_MX_RECV_MASK 0x00000000000000ffULL
#define BTL_MX_PUT_MASK 0xffffffffffffffffULL #define BTL_MX_PUT_MASK 0xffffffffffffffffULL
/** /**
@ -310,7 +308,6 @@ mca_btl_mx_prepare_dst( struct mca_btl_base_module_t* btl,
*/ */
int mca_btl_mx_ft_event(int state); int mca_btl_mx_ft_event(int state);
#if defined(c_plusplus) || defined(__cplusplus) END_C_DECLS
}
#endif
#endif #endif

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University * Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -143,9 +143,9 @@ int mca_btl_mx_component_open(void)
false, false, NULL, &mca_btl_mx_component.mx_if_exclude ); false, false, NULL, &mca_btl_mx_component.mx_if_exclude );
mca_btl_mx_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT; mca_btl_mx_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
mca_btl_mx_module.super.btl_eager_limit = 4096; mca_btl_mx_module.super.btl_eager_limit = 1024;
mca_btl_mx_module.super.btl_rndv_eager_limit = 4096; mca_btl_mx_module.super.btl_rndv_eager_limit = 1024;
mca_btl_mx_module.super.btl_max_send_size = 64*1024; mca_btl_mx_module.super.btl_max_send_size = 8*1024;
mca_btl_mx_module.super.btl_rdma_pipeline_send_length = 256*1024; mca_btl_mx_module.super.btl_rdma_pipeline_send_length = 256*1024;
mca_btl_mx_module.super.btl_rdma_pipeline_frag_size = 8*1024*1024; mca_btl_mx_module.super.btl_rdma_pipeline_frag_size = 8*1024*1024;
mca_btl_mx_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_mx_module.super.btl_min_rdma_pipeline_size = 0;
@ -215,11 +215,10 @@ mca_btl_mx_unexpected_handler( void *context, mx_endpoint_addr_t source,
/*opal_output( 0, "Get unexpected handler context %p source %lld match_value %lld\n" /*opal_output( 0, "Get unexpected handler context %p source %lld match_value %lld\n"
"\tlength %d data %p\n", context, source.stuff[0], match_value, length, "\tlength %d data %p\n", context, source.stuff[0], match_value, length,
data_if_available );*/ data_if_available );*/
if( match_value > MCA_BTL_TAG_MAX ) if( !(0x01 & match_value) )
return MX_RECV_CONTINUE; return MX_RECV_CONTINUE;
tag = match_value & 0xff; tag = (match_value >> 8) & 0xff;
assert( tag < 16 );
reg = mca_btl_base_active_message_trigger + tag; reg = mca_btl_base_active_message_trigger + tag;
segment.seg_addr.pval = data_if_available; segment.seg_addr.pval = data_if_available;
@ -437,11 +436,14 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
return NULL; return NULL;
} }
/* set the MX error handler to always return. This function is the only MX function /**
* allowed to be called before mx_init in order to make sure that if the MX is not * As the MX MTL gets initialized before the MX BTL it will call the
* up and running the MX library does not exit the application. * mx_init and the environment variables set by the BTL will be useless.
* Closing the MX will force the next call to mx_init to take these
* environment variables into account.
*/ */
mx_set_error_handler(MX_ERRORS_RETURN); /*(void)ompi_common_mx_finalize();*/
if( 0 == mca_btl_mx_component.mx_support_sharedmem ) if( 0 == mca_btl_mx_component.mx_support_sharedmem )
opal_setenv( "MX_DISABLE_SHMEM", "1", true, &environ ); opal_setenv( "MX_DISABLE_SHMEM", "1", true, &environ );
if( 0 == mca_btl_mx_component.mx_support_self ) if( 0 == mca_btl_mx_component.mx_support_self )
@ -449,6 +451,12 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
/* Force the long pipeline (up to 4Kb fragments) */ /* Force the long pipeline (up to 4Kb fragments) */
opal_setenv( "MX_PIPELINE_LOG", "0", true, &environ ); opal_setenv( "MX_PIPELINE_LOG", "0", true, &environ );
/* set the MX error handler to always return. This function is the only MX function
* allowed to be called before mx_init in order to make sure that if the MX is not
* up and running the MX library does not exit the application.
*/
mx_set_error_handler(MX_ERRORS_RETURN);
/* First check if MX is available ... */ /* First check if MX is available ... */
if( OMPI_SUCCESS != ompi_common_mx_initialize() ) { if( OMPI_SUCCESS != ompi_common_mx_initialize() ) {
ompi_modex_send(&mca_btl_mx_component.super.btl_version, ompi_modex_send(&mca_btl_mx_component.super.btl_version,
@ -618,14 +626,14 @@ int mca_btl_mx_component_progress(void)
*/ */
frag = mx_status.context; frag = mx_status.context;
if( NULL != frag ) { if( NULL != frag ) {
if( 0xff == frag->type ) { /* it's a send */ if( MCA_BTL_MX_SEND == frag->type ) { /* it's a send */
/* call the completion callback */ /* call the completion callback */
frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint,
&(frag->base), OMPI_SUCCESS ); &(frag->base), OMPI_SUCCESS );
} else if( !mca_btl_mx_component.mx_use_unexpected ) { /* and this one is a receive */ } else if( !mca_btl_mx_component.mx_use_unexpected ) { /* and this one is a receive */
mca_btl_active_message_callback_t* reg; mca_btl_active_message_callback_t* reg;
mx_segment_t mx_segment; mx_segment_t mx_segment;
uint8_t tag = mx_status.match_info & 0xff; uint8_t tag = (mx_status.match_info >> 8) & 0xff;
reg = mca_btl_base_active_message_trigger + tag; reg = mca_btl_base_active_message_trigger + tag;
frag->base.des_dst->seg_len = mx_status.msg_length; frag->base.des_dst->seg_len = mx_status.msg_length;
@ -638,7 +646,7 @@ int mca_btl_mx_component_progress(void)
mx_segment.segment_ptr = frag->base.des_dst->seg_addr.pval; mx_segment.segment_ptr = frag->base.des_dst->seg_addr.pval;
mx_segment.segment_length = mca_btl_mx_module.super.btl_eager_limit; mx_segment.segment_length = mca_btl_mx_module.super.btl_eager_limit;
mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1, mx_return = mx_irecv( mx_btl->mx_endpoint, &mx_segment, 1,
0x0ULL, 0x0ULL, 0x01ULL, BTL_MX_RECV_MASK,
frag, &(frag->mx_request) ); frag, &(frag->mx_request) );
if( MX_SUCCESS != mx_return ) { if( MX_SUCCESS != mx_return ) {
opal_output( 0, "Fail to re-register a fragment with the MX NIC ... (%s)\n", opal_output( 0, "Fail to re-register a fragment with the MX NIC ... (%s)\n",

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology * University Research and Technology
* Corporation. All rights reserved. * Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University * Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights * of Tennessee Research Foundation. All rights
* reserved. * reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -20,24 +20,20 @@
#define MCA_BTL_MX_FRAG_H #define MCA_BTL_MX_FRAG_H
#define MCA_BTL_MX_FRAG_ALIGN (8)
#include "ompi_config.h" #include "ompi_config.h"
#include "opal/class/opal_list.h" #include "opal/class/opal_list.h"
#include "ompi/class/ompi_free_list.h" #include "ompi/class/ompi_free_list.h"
#include "btl_mx.h" #include "btl_mx.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
#define MCA_BTL_MX_SEND 0x01 #define MCA_BTL_MX_SEND 0x01
#define MCA_BTL_MX_RECV 0x02 #define MCA_BTL_MX_RECV 0x02
/** BEGIN_C_DECLS
/**
* MX send fragment derived type. * MX send fragment derived type.
*/ */
struct mca_btl_mx_frag_t { struct mca_btl_mx_frag_t {
mca_btl_base_descriptor_t base; mca_btl_base_descriptor_t base;
mca_btl_base_segment_t segment[2]; mca_btl_base_segment_t segment[2];
struct mca_btl_base_endpoint_t* endpoint; struct mca_btl_base_endpoint_t* endpoint;
@ -45,51 +41,49 @@ extern "C" {
mx_request_t mx_request; mx_request_t mx_request;
size_t size; size_t size;
ompi_free_list_t* mx_frag_list; ompi_free_list_t* mx_frag_list;
}; };
typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_t; typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_mx_frag_t); OBJ_CLASS_DECLARATION(mca_btl_mx_frag_t);
typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_eager_t; typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_eager_t;
OBJ_CLASS_DECLARATION(mca_btl_mx_frag_eager_t); OBJ_CLASS_DECLARATION(mca_btl_mx_frag_eager_t);
typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_max_t; typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_max_t;
OBJ_CLASS_DECLARATION(mca_btl_mx_frag_max_t); OBJ_CLASS_DECLARATION(mca_btl_mx_frag_max_t);
typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_user_t; typedef struct mca_btl_mx_frag_t mca_btl_mx_frag_user_t;
OBJ_CLASS_DECLARATION(mca_btl_mx_frag_user_t); OBJ_CLASS_DECLARATION(mca_btl_mx_frag_user_t);
/* /*
* Macros to allocate/return descriptors from module specific * Macros to allocate/return descriptors from module specific
* free list(s). * free list(s).
*/ */
#define MCA_BTL_MX_FRAG_ALLOC_EAGER(btl, frag, rc) \ #define MCA_BTL_MX_FRAG_ALLOC_EAGER(btl, frag, rc) \
{ \ { \
ompi_free_list_item_t *item; \ ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT( &mca_btl_mx_component.mx_send_eager_frags, item, rc); \ OMPI_FREE_LIST_WAIT( &mca_btl_mx_component.mx_send_eager_frags, item, rc); \
frag = (mca_btl_mx_frag_t*) item; \ frag = (mca_btl_mx_frag_t*) item; \
frag->mx_frag_list = &(mca_btl_mx_component.mx_send_eager_frags); \ frag->mx_frag_list = &(mca_btl_mx_component.mx_send_eager_frags); \
frag->segment[0].seg_addr.pval = (void*)(frag+1); \ frag->segment[0].seg_addr.pval = (void*)(frag+1); \
} }
#define MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc) \ #define MCA_BTL_MX_FRAG_ALLOC_USER(btl, frag, rc) \
{ \ { \
ompi_free_list_item_t *item; \ ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT( &mca_btl_mx_component.mx_send_user_frags, item, rc); \ OMPI_FREE_LIST_WAIT( &mca_btl_mx_component.mx_send_user_frags, item, rc); \
frag = (mca_btl_mx_frag_t*) item; \ frag = (mca_btl_mx_frag_t*) item; \
frag->mx_frag_list = &(mca_btl_mx_component.mx_send_user_frags); \ frag->mx_frag_list = &(mca_btl_mx_component.mx_send_user_frags); \
} }
#define MCA_BTL_MX_FRAG_RETURN(btl, frag) \ #define MCA_BTL_MX_FRAG_RETURN(btl, frag) \
{ \ { \
/*opal_output( 0, "return item to %p\n", frag->mx_frag_list );*/ \
OMPI_FREE_LIST_RETURN( frag->mx_frag_list, (ompi_free_list_item_t*)(frag)); \ OMPI_FREE_LIST_RETURN( frag->mx_frag_list, (ompi_free_list_item_t*)(frag)); \
} }
END_C_DECLS
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif #endif

Просмотреть файл

@ -106,7 +106,7 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
{ {
mca_btl_mx_proc_t* module_proc = NULL; mca_btl_mx_proc_t* module_proc = NULL;
mca_btl_mx_addr_t *mx_peers; mca_btl_mx_addr_t *mx_peers;
int rc, i; int rc;
size_t size; size_t size;
/* Check if we have already created a MX proc /* Check if we have already created a MX proc
@ -142,8 +142,8 @@ mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
module_proc->mx_peers_count = size / sizeof(mca_btl_mx_addr_t); module_proc->mx_peers_count = size / sizeof(mca_btl_mx_addr_t);
#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
for (i = 0 ; i < module_proc->mx_peers_count ; ++i) { for (rc = 0 ; rc < module_proc->mx_peers_count ; ++rc) {
BTL_MX_ADDR_NTOH(mx_peers[i]); BTL_MX_ADDR_NTOH(mx_peers[rc]);
} }
#endif #endif
module_proc->mx_peers = mx_peers; module_proc->mx_peers = mx_peers;