Latency improvements: use ompi_free_list_init_ex(), add an optimal buffer alignment parameter, remove the rdma guarantee path, and replace dat_lmr_sync_rdma_write() with use of volatile

This commit was SVN r14634.
This commit is contained in:
Donald Kerr 2007-05-09 19:41:25 +00:00
parent b9195145e9
Commit 436d370d51
7 changed files with 91 additions and 191 deletions

View file

@@ -269,41 +269,57 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
     OBJ_CONSTRUCT(&btl->udapl_frag_control, ompi_free_list_t);
     OBJ_CONSTRUCT(&btl->udapl_lock, opal_mutex_t);
+    /* check buffer alignment against dat library */
+    if (mca_btl_udapl_component.udapl_buffer_alignment !=
+        DAT_OPTIMAL_ALIGNMENT) {
+        opal_show_help("help-mpi-btl-udapl.txt",
+            "optimal buffer alignment mismatch",
+            true,
+            DAT_OPTIMAL_ALIGNMENT,
+            mca_btl_udapl_component.udapl_buffer_alignment,
+            DAT_OPTIMAL_ALIGNMENT);
+    }
     /* initialize free lists */
-    ompi_free_list_init(&btl->udapl_frag_eager,
-        sizeof(mca_btl_udapl_frag_eager_t) +
-            mca_btl_udapl_component.udapl_eager_frag_size,
-        OBJ_CLASS(mca_btl_udapl_frag_eager_t),
-        mca_btl_udapl_component.udapl_free_list_num,
-        mca_btl_udapl_component.udapl_free_list_max,
-        mca_btl_udapl_component.udapl_free_list_inc,
-        btl->super.btl_mpool);
+    ompi_free_list_init_ex(&btl->udapl_frag_eager,
+        sizeof(mca_btl_udapl_frag_eager_t) +
+            mca_btl_udapl_component.udapl_eager_frag_size,
+        mca_btl_udapl_component.udapl_buffer_alignment,
+        OBJ_CLASS(mca_btl_udapl_frag_eager_t),
+        mca_btl_udapl_component.udapl_free_list_num,
+        mca_btl_udapl_component.udapl_free_list_max,
+        mca_btl_udapl_component.udapl_free_list_inc,
+        btl->super.btl_mpool);
-    ompi_free_list_init(&btl->udapl_frag_max,
-        sizeof(mca_btl_udapl_frag_max_t) +
-            mca_btl_udapl_component.udapl_max_frag_size,
-        OBJ_CLASS(mca_btl_udapl_frag_max_t),
-        mca_btl_udapl_component.udapl_free_list_num,
-        mca_btl_udapl_component.udapl_free_list_max,
-        mca_btl_udapl_component.udapl_free_list_inc,
-        btl->super.btl_mpool);
+    ompi_free_list_init_ex(&btl->udapl_frag_max,
+        sizeof(mca_btl_udapl_frag_max_t) +
+            mca_btl_udapl_component.udapl_max_frag_size,
+        mca_btl_udapl_component.udapl_buffer_alignment,
+        OBJ_CLASS(mca_btl_udapl_frag_max_t),
+        mca_btl_udapl_component.udapl_free_list_num,
+        mca_btl_udapl_component.udapl_free_list_max,
+        mca_btl_udapl_component.udapl_free_list_inc,
+        btl->super.btl_mpool);
-    ompi_free_list_init(&btl->udapl_frag_user,
-        sizeof(mca_btl_udapl_frag_user_t),
-        OBJ_CLASS(mca_btl_udapl_frag_user_t),
-        mca_btl_udapl_component.udapl_free_list_num,
-        mca_btl_udapl_component.udapl_free_list_max,
-        mca_btl_udapl_component.udapl_free_list_inc,
-        NULL);
+    ompi_free_list_init_ex(&btl->udapl_frag_user,
+        sizeof(mca_btl_udapl_frag_user_t),
+        mca_btl_udapl_component.udapl_buffer_alignment,
+        OBJ_CLASS(mca_btl_udapl_frag_user_t),
+        mca_btl_udapl_component.udapl_free_list_num,
+        mca_btl_udapl_component.udapl_free_list_max,
+        mca_btl_udapl_component.udapl_free_list_inc,
+        NULL);
-    ompi_free_list_init(&btl->udapl_frag_control,
-        sizeof(mca_btl_udapl_frag_eager_t) +
-            mca_btl_udapl_component.udapl_eager_frag_size,
-        OBJ_CLASS(mca_btl_udapl_frag_eager_t),
-        mca_btl_udapl_component.udapl_free_list_num,
-        -1,
-        mca_btl_udapl_component.udapl_free_list_inc,
-        btl->super.btl_mpool);
+    ompi_free_list_init_ex(&btl->udapl_frag_control,
+        sizeof(mca_btl_udapl_frag_eager_t) +
+            mca_btl_udapl_component.udapl_eager_frag_size,
+        mca_btl_udapl_component.udapl_buffer_alignment,
+        OBJ_CLASS(mca_btl_udapl_frag_eager_t),
+        mca_btl_udapl_component.udapl_free_list_num,
+        -1,
+        mca_btl_udapl_component.udapl_free_list_inc,
+        btl->super.btl_mpool);
     /* initialize eager rdma buffer info */
     orte_pointer_array_init(&btl->udapl_eager_rdma_endpoints,
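
The _ex variant threads the new udapl_buffer_alignment value through to the free-list allocator so each fragment's payload starts on the requested boundary. As a minimal sketch of what such aligned allocation has to do internally (illustrative only, not the OMPI implementation; align_up and the sizes are made up, and the alignment is assumed to be a power of two):

#include <stdint.h>
#include <stdlib.h>

/* Round a pointer up to 'alignment', which must be a power of two. */
static void *align_up(void *ptr, size_t alignment)
{
    uintptr_t p = (uintptr_t)ptr;
    return (void *)((p + alignment - 1) & ~((uintptr_t)alignment - 1));
}

int main(void)
{
    size_t alignment = 256;               /* e.g. DAT_OPTIMAL_ALIGNMENT */
    size_t payload   = 1024;              /* illustrative fragment size */
    void *raw = malloc(payload + alignment - 1);
    void *buf = align_up(raw, alignment); /* fragment payload starts aligned */
    /* ... register 'buf' with the DAT provider and use it ... */
    free(raw);
    return 0;
}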

View file

@@ -62,16 +62,6 @@ struct mca_btl_udapl_component_t {
     int32_t udapl_sr_win;        /**< number of fragments received before
                                       returning credits to sender */
     int32_t udapl_timeout;       /**< connection timeout, in microseconds */
-    int32_t udapl_eager_rdma_guarantee; /**< uDAPL does not guarantee
-                                      the order of data written to
-                                      buffer; if the interface
-                                      card in use guarantees front
-                                      to back order of data
-                                      written then this flag
-                                      should remain as set by
-                                      default (off), otherwise
-                                      latency overhead will
-                                      increase if turned on */
     size_t udapl_eager_frag_size;
     size_t udapl_max_frag_size;
     size_t udapl_eager_rdma_frag_size; /* size of the rdma fragment including data
@@ -90,6 +80,7 @@ struct mca_btl_udapl_component_t {
                                       received before returning credits to
                                       sender */
     int32_t udapl_async_events;  /**< dequeue asynchronous events */
+    int32_t udapl_buffer_alignment; /**< preferred communication buffer alignment, in bytes */
     opal_list_t udapl_procs;     /**< list of udapl proc structures */
     opal_mutex_t udapl_lock;     /**< lock for accessing module state */
     char* udapl_mpool_name;      /**< name of memory pool */

View file

@@ -618,12 +618,6 @@ int mca_btl_udapl_component_progress()
     dto = &event.event_data.dto_completion_event_data;
     frag = dto->user_cookie.as_ptr;
-    /* if we are using the "guarantee" rdma code path
-     * the extra write sets cookie to NULL; when this
-     * happens we ignore it because the completion
-     * write event is coming
-     */
-    if (frag == NULL) break;
     /* Was the DTO successful? */
     if(DAT_DTO_SUCCESS != dto->status) {
@@ -880,7 +874,6 @@ int mca_btl_udapl_component_progress()
     for (j = 0; j < rdma_ep_count; j++) {
         mca_btl_udapl_endpoint_t* endpoint;
         mca_btl_udapl_frag_t *local_rdma_frag;
-        DAT_LMR_TRIPLET local_rdma_segment;
         endpoint =
             orte_pointer_array_get_item(btl->udapl_eager_rdma_endpoints, j);
@@ -891,19 +884,6 @@ int mca_btl_udapl_component_progress()
         MCA_BTL_UDAPL_GET_LOCAL_RDMA_FRAG(endpoint,
             endpoint->endpoint_eager_rdma_local.head);
-        /* sync local memory before checking if active
-         * Question: will narrowing the sync area to just the active byte
-         * one, work and two, improve performance?
-         */
-        local_rdma_segment.lmr_context =
-            local_rdma_frag->triplet.lmr_context;
-        local_rdma_segment.virtual_address =
-            (DAT_VADDR)local_rdma_frag->segment.seg_addr.pval;
-        local_rdma_segment.segment_length = local_rdma_frag->size;
-        dat_lmr_sync_rdma_write(endpoint->endpoint_btl->udapl_ia,
-            &local_rdma_segment, 1);
         if (local_rdma_frag->rdma_ftr->active == 1) {
             int pad = 0;
             mca_btl_base_recv_reg_t* reg;
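
With the sync gone, the progress loop reads the footer flag directly; correctness now rests on the volatile qualifier (added to the footer struct in a later hunk) and on the adapter delivering RDMA payloads front to back. A trimmed sketch of the check (illustrative, not the exact OMPI code; rdma_footer and frag_arrived are stand-in names):

#include <stdint.h>

struct rdma_footer {             /* trimmed stand-in for             */
    uint32_t size;               /* mca_btl_udapl_rdma_footer_t      */
    volatile uint8_t active;     /* set to 1 by the sender's last byte */
    char pad[3];
};

/* One plain load replaces a dat_lmr_sync_rdma_write() call over the
 * whole fragment on every poll; 'volatile' keeps the compiler from
 * caching the byte in a register. */
static int frag_arrived(const struct rdma_footer *ftr)
{
    return ftr->active == 1;
}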

View file

@@ -77,7 +77,6 @@ int mca_btl_udapl_endpoint_write_eager(mca_btl_base_endpoint_t* endpoint,
     DAT_DTO_COOKIE cookie;
     char* remote_buf;
     DAT_RMR_TRIPLET remote_buffer;
-    DAT_LMR_TRIPLET local_iov; /* one contiguous write */
     int rc = OMPI_SUCCESS;
     int pad = 0;
     uint8_t head = endpoint->endpoint_eager_rdma_remote.head;
@@ -121,127 +120,32 @@ int mca_btl_udapl_endpoint_write_eager(mca_btl_base_endpoint_t* endpoint,
         frag->size -
         frag->triplet.segment_length;
-    if (mca_btl_udapl_component.udapl_eager_rdma_guarantee == 0) {
-        /* execute transfer with one contiguous write */
-        /* establish remote memory region */
-        remote_buffer.rmr_context =
-            (DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey;
-        remote_buffer.target_address = (DAT_VADDR)remote_buf;
-        remote_buffer.segment_length = frag->triplet.segment_length;
-        /* write the data out */
-        cookie.as_ptr = frag;
-        rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
-            1,
-            &(frag->triplet),
-            cookie,
-            &remote_buffer,
-            DAT_COMPLETION_DEFAULT_FLAG);
-        if(DAT_SUCCESS != rc) {
-            char* major;
-            char* minor;
-            dat_strerror(rc, (const char**)&major, (const char**)&minor);
-            BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
-                major, minor));
-            return OMPI_ERROR;
-        }
-    } else {
-        /* One must perform a few extra steps to guarantee that the last
-         * byte written is indeed the "active" value; this is
-         * accomplished by doing write-read-write; see Sections
-         * 6.6.21.0.1 and 6.8.2.1 in Version 1.2 (9/15/2004) of the uDAPL spec.
-         *
-         * Since the frag->triplet is already prepped for the
-         * non-guarantee single write case above, here we perform 2 writes:
-         * first the data and the udapl footer, skipping the pad,
-         * and then writing just the rdma footer, with the read
-         * in between as required to guarantee delivery of the
-         * second write after the first.
-         */
-        /* establish remote memory region for data and udapl footer */
-        remote_buffer.rmr_context =
-            (DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey;
-        remote_buffer.target_address = (DAT_VADDR)remote_buf;
-        remote_buffer.segment_length = (frag->triplet.segment_length -
-            sizeof(mca_btl_udapl_rdma_footer_t) - pad);
-        /* establish local memory region for data and udapl footer */
-        local_iov.lmr_context = frag->triplet.lmr_context;
-        local_iov.virtual_address = (DAT_VADDR)frag->triplet.virtual_address;
-        local_iov.segment_length = (frag->triplet.segment_length -
-            sizeof(mca_btl_udapl_rdma_footer_t) - pad);
-        /* write the data */
-        cookie.as_ptr = NULL;
-        rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
-            1,
-            &local_iov,
-            cookie,
-            &remote_buffer,
-            DAT_COMPLETION_DEFAULT_FLAG);
-        if(DAT_SUCCESS != rc) {
-            char* major;
-            char* minor;
-            dat_strerror(rc, (const char**)&major, (const char**)&minor);
-            BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
-                major, minor));
-            return OMPI_ERROR;
-        }
-        /* perform zero byte read of the remote memory region */
-        remote_buffer.target_address = (DAT_VADDR)remote_buf;
-        remote_buffer.segment_length = frag->triplet.segment_length;
-        local_iov.virtual_address = (DAT_VADDR)NULL;
-        local_iov.segment_length = 0;
-        cookie.as_ptr = NULL;
-        rc = dat_ep_post_rdma_read(endpoint->endpoint_eager,
-            0,
-            &local_iov,
-            cookie,
-            &remote_buffer,
-            DAT_COMPLETION_DEFAULT_FLAG);
-        if(DAT_SUCCESS != rc) {
-            char* major;
-            char* minor;
-            dat_strerror(rc, (const char**)&major, (const char**)&minor);
-            BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_read",
-                major, minor));
-            return OMPI_ERROR;
-        }
-        /* establish remote memory region for rdma footer */
-        remote_buffer.target_address = (DAT_VADDR)((char *)remote_buf +
-            frag->triplet.segment_length - sizeof(mca_btl_udapl_rdma_footer_t));
-        remote_buffer.segment_length = sizeof(mca_btl_udapl_rdma_footer_t);
-        /* establish local memory region for rdma footer */
-        local_iov.virtual_address = (DAT_VADDR)(frag->rdma_ftr);
-        local_iov.segment_length = sizeof(mca_btl_udapl_rdma_footer_t);
-        /* write the footer */
-        cookie.as_ptr = frag;
-        rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
-            1,
-            &local_iov,
-            cookie,
-            &remote_buffer,
-            DAT_COMPLETION_DEFAULT_FLAG);
-        if(DAT_SUCCESS != rc) {
-            char* major;
-            char* minor;
-            dat_strerror(rc, (const char**)&major, (const char**)&minor);
-            BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
-                major, minor));
-            return OMPI_ERROR;
-        }
-    }
+    /* execute transfer with one contiguous write */
+    /* establish remote memory region */
+    remote_buffer.rmr_context =
+        (DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey;
+    remote_buffer.target_address = (DAT_VADDR)remote_buf;
+    remote_buffer.segment_length = frag->triplet.segment_length;
+    /* write the data out */
+    cookie.as_ptr = frag;
+    rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
+        1,
+        &(frag->triplet),
+        cookie,
+        &remote_buffer,
+        DAT_COMPLETION_DEFAULT_FLAG);
+    if(DAT_SUCCESS != rc) {
+        char* major;
+        char* minor;
+        dat_strerror(rc, (const char**)&major, (const char**)&minor);
+        BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
+            major, minor));
+        return OMPI_ERROR;
+    }
     return rc;
 }
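
The single-write fast path is safe only because the rdma footer, and hence its active byte, sits at the very tail of the transferred segment. A hypothetical helper showing the same address arithmetic the removed footer write used (the function name is illustrative; the footer type is the one defined in the fragment header shown below):

#include <stddef.h>

/* Locate the rdma footer at the tail of a fragment of
 * 'segment_length' bytes starting at 'buf'.  The sender writes the
 * footer's 'active' byte last, so on adapters that store RDMA data
 * front to back, seeing active == 1 implies the payload before it
 * is complete. */
static mca_btl_udapl_rdma_footer_t *
rdma_footer_of(char *buf, size_t segment_length)
{
    return (mca_btl_udapl_rdma_footer_t *)
        (buf + segment_length - sizeof(mca_btl_udapl_rdma_footer_t));
}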

View file

@@ -11,7 +11,7 @@
  *                         All rights reserved.
  * Copyright (c) 2006      Sandia National Laboratories. All rights
  *                         reserved.
- * Copyright (c) 2006      Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2007      Sun Microsystems, Inc. All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -73,9 +73,10 @@ typedef struct mca_btl_udapl_footer_t mca_btl_udapl_footer_t;
  */
 struct mca_btl_udapl_rdma_footer_t {
     uint32_t size;
-    uint8_t active;          /* 0 = not in use; 1 = data is available to be received;
-                              * this should always be the last entry in this structure
-                              */
+    volatile uint8_t active; /* 0 = not in use; 1 = data is available to be
+                              * received; this should always be the last entry
+                              * in this structure
+                              */
     char pad[3];             /* pad out to be aligned on MCA_BTL_UDAPL_FRAG_ALIGN byte boundary */
 };
 typedef struct mca_btl_udapl_rdma_footer_t mca_btl_udapl_rdma_footer_t;
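
Why volatile is the load-bearing change here: without it, a compiler may legally hoist the read of active out of a polling loop and spin on a stale register copy. A minimal illustration (not OMPI code; wait_until_active is a made-up name):

#include <stdint.h>

/* With a non-volatile flag this loop could be compiled as
 * 'if (*active == 0) for (;;);', caching the field in a register.
 * The volatile qualifier forces a fresh load from memory on every
 * iteration, so the byte written last by the remote RDMA transfer
 * is eventually observed. */
static void wait_until_active(volatile const uint8_t *active)
{
    while (*active == 0) {
        ;   /* spin until the sender's final byte lands */
    }
}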

View file

@@ -200,14 +200,6 @@ int mca_btl_udapl_register_mca_params(void)
         REGINT_GE_ONE), tmp_rc, rc);
     mca_btl_udapl_component.udapl_timeout = (uint32_t) ival;
-    CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("eager_rdma_guarantee",
-        "If the interface card in use guarantees front to back order "
-        "of data written then this flag should remain as set by "
-        "default (off).",
-        0,
-        &mca_btl_udapl_component.udapl_eager_rdma_guarantee,
-        REGINT_GE_ZERO), tmp_rc, rc);
     CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("async_events",
         "The asynchronous event queue will only be "
         "checked after entering progress this number of times.",
@@ -215,6 +207,13 @@ int mca_btl_udapl_register_mca_params(void)
         &mca_btl_udapl_component.udapl_async_events,
         REGINT_GE_ONE), tmp_rc, rc);
+    CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("buffer_alignment",
+        "Preferred communication buffer alignment, "
+        "in bytes (must be >= 1).",
+        DAT_OPTIMAL_ALIGNMENT,
+        &mca_btl_udapl_component.udapl_buffer_alignment,
+        REGINT_GE_ONE), tmp_rc, rc);
     /* register uDAPL module parameters */
     CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("evd_qlen",
         "The event dispatcher queue length.",

View file

@@ -30,3 +30,12 @@ value, e.g. 16.
 WARNING: Using default uDAPL endpoint parameters not those that
 would have been modified by MCA parameters.
+[optimal buffer alignment mismatch]
+WARNING: DAT_OPTIMAL_ALIGNMENT = %d : BTL buffer_alignment = %d.
+The BTL buffer_alignment value may not be optimal.  If all nodes
+report the same DAT_OPTIMAL_ALIGNMENT value and this differs from
+BTL buffer_alignment then setting "--mca btl_udapl_buffer_alignment
+%d" may improve performance.