latency improvements: use ompi_free_list_init_ex, create optimal alignment parameter, remove rdma guarantee path, replace dat_lmr_sync_rdma_write with use of volatile
This commit was SVN r14634.
Этот коммит содержится в:
родитель
b9195145e9
Коммит
436d370d51
@ -269,41 +269,57 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
|
||||
OBJ_CONSTRUCT(&btl->udapl_frag_control, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&btl->udapl_lock, opal_mutex_t);
|
||||
|
||||
/* check buffer alignment against dat library */
|
||||
if (mca_btl_udapl_component.udapl_buffer_alignment !=
|
||||
DAT_OPTIMAL_ALIGNMENT) {
|
||||
|
||||
opal_show_help("help-mpi-btl-udapl.txt",
|
||||
"optimal buffer alignment mismatch",
|
||||
true,
|
||||
DAT_OPTIMAL_ALIGNMENT,
|
||||
mca_btl_udapl_component.udapl_buffer_alignment,
|
||||
DAT_OPTIMAL_ALIGNMENT);
|
||||
}
|
||||
|
||||
/* initialize free lists */
|
||||
ompi_free_list_init(&btl->udapl_frag_eager,
|
||||
sizeof(mca_btl_udapl_frag_eager_t) +
|
||||
mca_btl_udapl_component.udapl_eager_frag_size,
|
||||
OBJ_CLASS(mca_btl_udapl_frag_eager_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
mca_btl_udapl_component.udapl_free_list_max,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
btl->super.btl_mpool);
|
||||
ompi_free_list_init_ex(&btl->udapl_frag_eager,
|
||||
sizeof(mca_btl_udapl_frag_eager_t) +
|
||||
mca_btl_udapl_component.udapl_eager_frag_size,
|
||||
mca_btl_udapl_component.udapl_buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_udapl_frag_eager_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
mca_btl_udapl_component.udapl_free_list_max,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
btl->super.btl_mpool);
|
||||
|
||||
ompi_free_list_init(&btl->udapl_frag_max,
|
||||
sizeof(mca_btl_udapl_frag_max_t) +
|
||||
mca_btl_udapl_component.udapl_max_frag_size,
|
||||
OBJ_CLASS(mca_btl_udapl_frag_max_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
mca_btl_udapl_component.udapl_free_list_max,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
btl->super.btl_mpool);
|
||||
ompi_free_list_init_ex(&btl->udapl_frag_max,
|
||||
sizeof(mca_btl_udapl_frag_max_t) +
|
||||
mca_btl_udapl_component.udapl_max_frag_size,
|
||||
mca_btl_udapl_component.udapl_buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_udapl_frag_max_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
mca_btl_udapl_component.udapl_free_list_max,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
btl->super.btl_mpool);
|
||||
|
||||
ompi_free_list_init(&btl->udapl_frag_user,
|
||||
sizeof(mca_btl_udapl_frag_user_t),
|
||||
OBJ_CLASS(mca_btl_udapl_frag_user_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
mca_btl_udapl_component.udapl_free_list_max,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
NULL);
|
||||
ompi_free_list_init_ex(&btl->udapl_frag_user,
|
||||
sizeof(mca_btl_udapl_frag_user_t),
|
||||
mca_btl_udapl_component.udapl_buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_udapl_frag_user_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
mca_btl_udapl_component.udapl_free_list_max,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
NULL);
|
||||
|
||||
ompi_free_list_init(&btl->udapl_frag_control,
|
||||
sizeof(mca_btl_udapl_frag_eager_t) +
|
||||
mca_btl_udapl_component.udapl_eager_frag_size,
|
||||
OBJ_CLASS(mca_btl_udapl_frag_eager_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
-1,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
btl->super.btl_mpool);
|
||||
ompi_free_list_init_ex(&btl->udapl_frag_control,
|
||||
sizeof(mca_btl_udapl_frag_eager_t) +
|
||||
mca_btl_udapl_component.udapl_eager_frag_size,
|
||||
mca_btl_udapl_component.udapl_buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_udapl_frag_eager_t),
|
||||
mca_btl_udapl_component.udapl_free_list_num,
|
||||
-1,
|
||||
mca_btl_udapl_component.udapl_free_list_inc,
|
||||
btl->super.btl_mpool);
|
||||
|
||||
/* initialize eager rdma buffer info */
|
||||
orte_pointer_array_init(&btl->udapl_eager_rdma_endpoints,
|
||||
|
@ -62,16 +62,6 @@ struct mca_btl_udapl_component_t {
|
||||
int32_t udapl_sr_win; /**< number of fragments recieved before
|
||||
returning credits to sender */
|
||||
int32_t udapl_timeout; /**< connection timeout, in microseconds */
|
||||
int32_t udapl_eager_rdma_guarantee;/**< uDAPL does not guarantee
|
||||
the order of data written to
|
||||
buffer, if the interface
|
||||
card in use guarantees front
|
||||
to back order of data
|
||||
written then this flag
|
||||
should remain as set by
|
||||
default (off) otherwise
|
||||
latency overhead will
|
||||
increase if turned on */
|
||||
size_t udapl_eager_frag_size;
|
||||
size_t udapl_max_frag_size;
|
||||
size_t udapl_eager_rdma_frag_size; /* size of the rdma fragement including data
|
||||
@ -90,6 +80,7 @@ struct mca_btl_udapl_component_t {
|
||||
received before returning credits to
|
||||
sender */
|
||||
int32_t udapl_async_events; /**< dequeue asynchronous events */
|
||||
int32_t udapl_buffer_alignment; /**< preferred communication buffer alignment, in bytes */
|
||||
opal_list_t udapl_procs; /**< list of udapl proc structures */
|
||||
opal_mutex_t udapl_lock; /**< lock for accessing module state */
|
||||
char* udapl_mpool_name; /**< name of memory pool */
|
||||
|
@ -618,12 +618,6 @@ int mca_btl_udapl_component_progress()
|
||||
dto = &event.event_data.dto_completion_event_data;
|
||||
|
||||
frag = dto->user_cookie.as_ptr;
|
||||
/* if we are using the "guarantee" rdma code path
|
||||
* the extra write sets cookie to NULL, when this
|
||||
* happens we ignore it because the completion
|
||||
* write event is coming
|
||||
*/
|
||||
if (frag == NULL) break;
|
||||
|
||||
/* Was the DTO successful? */
|
||||
if(DAT_DTO_SUCCESS != dto->status) {
|
||||
@ -880,7 +874,6 @@ int mca_btl_udapl_component_progress()
|
||||
for (j = 0; j < rdma_ep_count; j++) {
|
||||
mca_btl_udapl_endpoint_t* endpoint;
|
||||
mca_btl_udapl_frag_t *local_rdma_frag;
|
||||
DAT_LMR_TRIPLET local_rdma_segment;
|
||||
|
||||
endpoint =
|
||||
orte_pointer_array_get_item(btl->udapl_eager_rdma_endpoints, j);
|
||||
@ -891,19 +884,6 @@ int mca_btl_udapl_component_progress()
|
||||
MCA_BTL_UDAPL_GET_LOCAL_RDMA_FRAG(endpoint,
|
||||
endpoint->endpoint_eager_rdma_local.head);
|
||||
|
||||
/* sync local memory before checking if active
|
||||
* Question, will narrowing sync area to just the active byte
|
||||
* one, work and two, improve performance
|
||||
*/
|
||||
local_rdma_segment.lmr_context =
|
||||
local_rdma_frag->triplet.lmr_context;
|
||||
local_rdma_segment.virtual_address =
|
||||
(DAT_VADDR)local_rdma_frag->segment.seg_addr.pval;
|
||||
local_rdma_segment.segment_length = local_rdma_frag->size;
|
||||
|
||||
dat_lmr_sync_rdma_write(endpoint->endpoint_btl->udapl_ia,
|
||||
&local_rdma_segment, 1);
|
||||
|
||||
if (local_rdma_frag->rdma_ftr->active == 1) {
|
||||
int pad = 0;
|
||||
mca_btl_base_recv_reg_t* reg;
|
||||
|
@ -77,7 +77,6 @@ int mca_btl_udapl_endpoint_write_eager(mca_btl_base_endpoint_t* endpoint,
|
||||
DAT_DTO_COOKIE cookie;
|
||||
char* remote_buf;
|
||||
DAT_RMR_TRIPLET remote_buffer;
|
||||
DAT_LMR_TRIPLET local_iov; /* one contiguous write */
|
||||
int rc = OMPI_SUCCESS;
|
||||
int pad = 0;
|
||||
uint8_t head = endpoint->endpoint_eager_rdma_remote.head;
|
||||
@ -121,127 +120,32 @@ int mca_btl_udapl_endpoint_write_eager(mca_btl_base_endpoint_t* endpoint,
|
||||
frag->size -
|
||||
frag->triplet.segment_length;
|
||||
|
||||
if (mca_btl_udapl_component.udapl_eager_rdma_guarantee == 0) {
|
||||
/* execute transfer with one contiguous write */
|
||||
/* execute transfer with one contiguous write */
|
||||
|
||||
/* establish remote memory region */
|
||||
remote_buffer.rmr_context =
|
||||
(DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey;
|
||||
remote_buffer.target_address = (DAT_VADDR)remote_buf;
|
||||
remote_buffer.segment_length = frag->triplet.segment_length;
|
||||
/* establish remote memory region */
|
||||
remote_buffer.rmr_context =
|
||||
(DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey;
|
||||
remote_buffer.target_address = (DAT_VADDR)remote_buf;
|
||||
remote_buffer.segment_length = frag->triplet.segment_length;
|
||||
|
||||
/* write the data out */
|
||||
cookie.as_ptr = frag;
|
||||
rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
|
||||
1,
|
||||
&(frag->triplet),
|
||||
cookie,
|
||||
&remote_buffer,
|
||||
DAT_COMPLETION_DEFAULT_FLAG);
|
||||
if(DAT_SUCCESS != rc) {
|
||||
char* major;
|
||||
char* minor;
|
||||
/* write the data out */
|
||||
cookie.as_ptr = frag;
|
||||
rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
|
||||
1,
|
||||
&(frag->triplet),
|
||||
cookie,
|
||||
&remote_buffer,
|
||||
DAT_COMPLETION_DEFAULT_FLAG);
|
||||
if(DAT_SUCCESS != rc) {
|
||||
char* major;
|
||||
char* minor;
|
||||
|
||||
dat_strerror(rc, (const char**)&major, (const char**)&minor);
|
||||
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
|
||||
major, minor));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
} else {
|
||||
/* One must perform a few extra steps to guarantee that the last
|
||||
* byte written is indeed the "active" value; This is
|
||||
* accomplished by doing write-read-write; See Sections
|
||||
* 6.6.21.0.1 and 6.8.2.1 in Version 1.2 9/15/2004 of the UDAPL Spec.
|
||||
*
|
||||
* Since the frag->triplet is already prep'ed for the non
|
||||
* guarantee single write case above, here we perform 2 writes:
|
||||
* first the data and the udapl footer, skipping the pad,
|
||||
* and then writing just the rdma footer. With the read
|
||||
* in between as required to guarantee delivery of the
|
||||
* second write after the first.
|
||||
*/
|
||||
|
||||
/* establish remote memory region for data and udapl footer */
|
||||
remote_buffer.rmr_context =
|
||||
(DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey;
|
||||
remote_buffer.target_address = (DAT_VADDR)remote_buf;
|
||||
remote_buffer.segment_length = (frag->triplet.segment_length -
|
||||
sizeof(mca_btl_udapl_rdma_footer_t) - pad);
|
||||
|
||||
/* establish local memory region for data and udapl footer */
|
||||
local_iov.lmr_context = frag->triplet.lmr_context;
|
||||
local_iov.virtual_address = (DAT_VADDR)frag->triplet.virtual_address;
|
||||
local_iov.segment_length = (frag->triplet.segment_length -
|
||||
sizeof(mca_btl_udapl_rdma_footer_t) - pad);
|
||||
|
||||
/* write the data */
|
||||
cookie.as_ptr = NULL;
|
||||
rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
|
||||
1,
|
||||
&local_iov,
|
||||
cookie,
|
||||
&remote_buffer,
|
||||
DAT_COMPLETION_DEFAULT_FLAG);
|
||||
if(DAT_SUCCESS != rc) {
|
||||
char* major;
|
||||
char* minor;
|
||||
dat_strerror(rc, (const char**)&major, (const char**)&minor);
|
||||
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
|
||||
major, minor));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
dat_strerror(rc, (const char**)&major, (const char**)&minor);
|
||||
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
|
||||
major, minor));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* perform zero byte read of the remote memory region */
|
||||
remote_buffer.target_address = (DAT_VADDR)remote_buf;
|
||||
remote_buffer.segment_length = frag->triplet.segment_length;
|
||||
local_iov.virtual_address = (DAT_VADDR)NULL;
|
||||
local_iov.segment_length = 0;
|
||||
|
||||
cookie.as_ptr = NULL;
|
||||
rc = dat_ep_post_rdma_read(endpoint->endpoint_eager,
|
||||
0,
|
||||
&local_iov,
|
||||
cookie,
|
||||
&remote_buffer,
|
||||
DAT_COMPLETION_DEFAULT_FLAG);
|
||||
if(DAT_SUCCESS != rc) {
|
||||
char* major;
|
||||
char* minor;
|
||||
|
||||
dat_strerror(rc, (const char**)&major, (const char**)&minor);
|
||||
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_read",
|
||||
major, minor));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* establish remote memory region for rdma footer */
|
||||
remote_buffer.target_address = (DAT_VADDR)((char *)remote_buf +
|
||||
frag->triplet.segment_length - sizeof(mca_btl_udapl_rdma_footer_t));
|
||||
remote_buffer.segment_length = sizeof(mca_btl_udapl_rdma_footer_t);
|
||||
|
||||
/* establish local memory region for rdma footer */
|
||||
local_iov.virtual_address = (DAT_VADDR)(frag->rdma_ftr);
|
||||
local_iov.segment_length = sizeof(mca_btl_udapl_rdma_footer_t);
|
||||
|
||||
/* write the footer */
|
||||
cookie.as_ptr = frag;
|
||||
rc = dat_ep_post_rdma_write(endpoint->endpoint_eager,
|
||||
1,
|
||||
&local_iov,
|
||||
cookie,
|
||||
&remote_buffer,
|
||||
DAT_COMPLETION_DEFAULT_FLAG);
|
||||
if(DAT_SUCCESS != rc) {
|
||||
char* major;
|
||||
char* minor;
|
||||
|
||||
dat_strerror(rc, (const char**)&major, (const char**)&minor);
|
||||
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
|
||||
major, minor));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Sandia National Laboratories. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -73,9 +73,10 @@ typedef struct mca_btl_udapl_footer_t mca_btl_udapl_footer_t;
|
||||
*/
|
||||
struct mca_btl_udapl_rdma_footer_t {
|
||||
uint32_t size;
|
||||
uint8_t active; /* 0 = not in use; 1 = data is available to be received;
|
||||
* this should always be the last entry in this structure
|
||||
*/
|
||||
volatile uint8_t active;/* 0 = not in use; 1 = data is available to be
|
||||
* received; this should always be the last entry
|
||||
* in this structure
|
||||
*/
|
||||
char pad[3]; /* pad out be aligned on MCA_BTL_UDAPL_FRAG_ALIGN byte boundary */
|
||||
};
|
||||
typedef struct mca_btl_udapl_rdma_footer_t mca_btl_udapl_rdma_footer_t;
|
||||
|
@ -200,14 +200,6 @@ int mca_btl_udapl_register_mca_params(void)
|
||||
REGINT_GE_ONE), tmp_rc, rc);
|
||||
mca_btl_udapl_component.udapl_timeout = (uint32_t) ival;
|
||||
|
||||
CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("eager_rdma_guarantee",
|
||||
"If the interface card in use guarantees front to back order "
|
||||
"of data written then this flag should remain as set by "
|
||||
"default (off).",
|
||||
0,
|
||||
&mca_btl_udapl_component.udapl_eager_rdma_guarantee,
|
||||
REGINT_GE_ZERO), tmp_rc, rc);
|
||||
|
||||
CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("async_events",
|
||||
"The asynchronous event queue will only be "
|
||||
"checked after entering progress this number of times.",
|
||||
@ -215,6 +207,13 @@ int mca_btl_udapl_register_mca_params(void)
|
||||
&mca_btl_udapl_component.udapl_async_events,
|
||||
REGINT_GE_ONE), tmp_rc, rc);
|
||||
|
||||
CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("buffer_alignment",
|
||||
"Preferred communication buffer alignment, "
|
||||
"in bytes (must be >= 1).",
|
||||
DAT_OPTIMAL_ALIGNMENT,
|
||||
&mca_btl_udapl_component.udapl_buffer_alignment,
|
||||
REGINT_GE_ONE), tmp_rc, rc);
|
||||
|
||||
/* register uDAPL module parameters */
|
||||
CHECK_PARAM_REGISTER_RETURN_VALUE(mca_btl_udapl_reg_int("evd_qlen",
|
||||
"The event dispatcher queue length.",
|
||||
|
@ -30,3 +30,12 @@ value, e.g. 16.
|
||||
|
||||
WARNING: Using default uDAPL endpoint parameters not those that
|
||||
would have been modified by MCA parameters.
|
||||
|
||||
[optimal buffer alignment mismatch]
|
||||
|
||||
WARNING: DAT_OPTIMAL_ALIGNMENT = %d : BTL buffer_alignment = %d.
|
||||
The BTL buffer_alignment value may not be optimal. If all nodes
|
||||
report the same DAT_OPTIMAL_ALIGNMENT value and this differs from
|
||||
BTL buffer_alignment then setting "--mca btl_udapl_buffer_alignment
|
||||
%d" may improve performance.
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user