osc/rdma: fix hang when performing large unaligned gets
This commit adds code to handle large unaligned gets. There are two possible code paths for these transactions: 1) The remote region and local region have the same alignment. In this case the get will be broken down into at most three get transactions: 1 transaction to get the unaligned start of the region (buffered), 1 transaction to get the aligned portion of the region, and 1 transaction to get the end of the region. 2) The remote and local regions do not have the same alignment. This should be an uncommon case and is not optimized. In this case a buffer is allocated and registered locally to hold the aligned data from the remote region. There may be cases where this fails (low memory, can't register memory). Those conditions are unlikely and will be handled later. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
70787d1574
Коммит
45da311473
@ -15,6 +15,11 @@
|
||||
#include "osc_rdma_dynamic.h"
|
||||
|
||||
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
|
||||
#include "opal/align.h"
|
||||
|
||||
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
|
||||
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
|
||||
ompi_osc_rdma_request_t *request);
|
||||
|
||||
static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
void *local_address, mca_btl_base_registration_handle_t *local_handle,
|
||||
@ -136,7 +141,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
|
||||
ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
|
||||
mca_btl_base_registration_handle_t *remote_handle, int remote_count,
|
||||
ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
|
||||
const ompi_osc_rdma_fn_t rdma_fn,const bool alloc_reqs)
|
||||
const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
|
||||
{
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX];
|
||||
@ -575,11 +580,13 @@ static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struc
|
||||
|
||||
assert (OPAL_SUCCESS == status);
|
||||
|
||||
if (NULL != frag) {
|
||||
if (request->buffer || NULL != frag) {
|
||||
if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
|
||||
memcpy (origin_addr, (void *) source, request->len);
|
||||
}
|
||||
}
|
||||
|
||||
if (NULL != frag) {
|
||||
ompi_osc_rdma_frag_complete (frag);
|
||||
} else {
|
||||
ompi_osc_rdma_deregister (sync->module, local_handle);
|
||||
@ -621,6 +628,27 @@ int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer)
|
||||
|
||||
}
|
||||
|
||||
static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
|
||||
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
|
||||
ompi_osc_rdma_request_t *request) {
|
||||
ompi_osc_rdma_module_t *module = sync->module;
|
||||
ompi_osc_rdma_request_t *subreq;
|
||||
int ret;
|
||||
|
||||
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
|
||||
subreq->internal = true;
|
||||
subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
|
||||
subreq->parent_request = request;
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
|
||||
|
||||
ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
|
||||
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
|
||||
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
|
||||
@ -639,33 +667,81 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
|
||||
aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask;
|
||||
aligned_len = aligned_source_bound - aligned_source_base;
|
||||
|
||||
request->offset = source_address - aligned_source_base;
|
||||
request->len = size;
|
||||
request->origin_addr = target_buffer;
|
||||
request->sync = sync;
|
||||
|
||||
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p",
|
||||
size, source_address, target_buffer);
|
||||
|
||||
if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) ||
|
||||
(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
|
||||
|
||||
ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
/* check for alignment */
|
||||
if (!(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
|
||||
(void) ompi_osc_rdma_register (module, peer->data_endpoint, target_buffer, size, MCA_BTL_REG_FLAG_LOCAL_WRITE,
|
||||
if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) {
|
||||
/* region is too large for a buffered read */
|
||||
size_t subsize;
|
||||
|
||||
if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) {
|
||||
/* remote region has the same alignment but the base is not aligned. perform a small
|
||||
* buffered get of the beginning of the remote region */
|
||||
aligned_source_base = OPAL_ALIGN(source_address, module->selected_btl->btl_get_alignment, osc_rdma_base_t);
|
||||
subsize = (size_t) (aligned_source_base - source_address);
|
||||
|
||||
ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
source_address += subsize;
|
||||
target_buffer = (void *) ((intptr_t) target_buffer + subsize);
|
||||
size -= subsize;
|
||||
|
||||
aligned_len = aligned_source_bound - aligned_source_base;
|
||||
}
|
||||
|
||||
if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) &&
|
||||
(size & btl_alignment_mask)) {
|
||||
/* remote region bases are aligned but the bounds are not. perform a
|
||||
* small buffered get of the end of the remote region */
|
||||
aligned_len = size & ~btl_alignment_mask;
|
||||
subsize = size - aligned_len;
|
||||
size = aligned_len;
|
||||
ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle,
|
||||
(void *) ((intptr_t) target_buffer + aligned_len), subsize, request);
|
||||
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
/* (remaining) user request is now correctly aligned */
|
||||
}
|
||||
|
||||
if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
|
||||
/* local and remote alignments differ */
|
||||
request->buffer = ptr = malloc (aligned_len);
|
||||
} else {
|
||||
ptr = target_buffer;
|
||||
}
|
||||
|
||||
if (NULL != ptr) {
|
||||
(void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE,
|
||||
&local_handle);
|
||||
}
|
||||
|
||||
if (OPAL_UNLIKELY(NULL == local_handle)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
free (request->buffer);
|
||||
request->buffer = NULL;
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get", ptr, (void *) frag);
|
||||
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx",
|
||||
ptr, (void *) frag, aligned_len, (unsigned long) aligned_source_base);
|
||||
local_handle = frag->handle;
|
||||
}
|
||||
}
|
||||
|
||||
request->offset = source_address - aligned_source_base;
|
||||
request->len = size;
|
||||
request->origin_addr = target_buffer;
|
||||
request->sync = sync;
|
||||
|
||||
ompi_osc_rdma_sync_rdma_inc (sync);
|
||||
|
||||
do {
|
||||
|
@ -60,7 +60,7 @@ static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size
|
||||
request_len = OPAL_ALIGN(request_len, 8, size_t);
|
||||
|
||||
if (request_len > (mca_osc_rdma_component.buffer_size >> 1)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
@ -50,7 +50,7 @@ static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *mod
|
||||
{
|
||||
uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
|
||||
volatile bool atomic_complete = false;
|
||||
void *temp;
|
||||
void *temp = NULL;
|
||||
int ret;
|
||||
|
||||
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing shared lock %" PRIx64 " on peer %d. value 0x%lx", lock,
|
||||
@ -117,7 +117,7 @@ static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *mod
|
||||
{
|
||||
uint64_t lock = (uint64_t) peer->state + offset;
|
||||
volatile bool atomic_complete;
|
||||
ompi_osc_rdma_lock_t *temp;
|
||||
ompi_osc_rdma_lock_t *temp = NULL;
|
||||
int ret;
|
||||
|
||||
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring shared lock %" PRIx64 " on peer %d. value 0x%lx", lock,
|
||||
@ -292,7 +292,7 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t *
|
||||
{
|
||||
uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
|
||||
volatile bool atomic_complete = false;
|
||||
void *temp;
|
||||
void *temp = NULL;
|
||||
int ret;
|
||||
|
||||
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank);
|
||||
|
@ -59,7 +59,10 @@ static void request_construct(ompi_osc_rdma_request_t *request)
|
||||
request->super.req_free = request_free;
|
||||
request->super.req_cancel = request_cancel;
|
||||
request->super.req_complete_cb = request_complete;
|
||||
request->parent_request = 0;
|
||||
request->parent_request = NULL;
|
||||
request->buffer = NULL;
|
||||
request->internal = false;
|
||||
request->outstanding_requests = 0;
|
||||
OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
|
||||
}
|
||||
|
||||
|
@ -60,6 +60,7 @@ struct ompi_osc_rdma_request_t {
|
||||
|
||||
/** synchronization object */
|
||||
struct ompi_osc_rdma_sync_t *sync;
|
||||
void *buffer;
|
||||
};
|
||||
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t;
|
||||
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
|
||||
@ -78,18 +79,19 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
|
||||
req = (ompi_osc_rdma_request_t*) item; \
|
||||
OMPI_REQUEST_INIT(&req->super, false); \
|
||||
req->super.req_mpi_object.win = module->win; \
|
||||
req->super.req_complete = false; \
|
||||
req->super.req_state = OMPI_REQUEST_ACTIVE; \
|
||||
req->module = rmodule; \
|
||||
req->internal = false; \
|
||||
req->outstanding_requests = 0; \
|
||||
req->parent_request = NULL; \
|
||||
req->peer = (rpeer); \
|
||||
} while (0)
|
||||
|
||||
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \
|
||||
do { \
|
||||
OMPI_REQUEST_FINI(&(req)->super); \
|
||||
free ((req)->buffer); \
|
||||
(req)->buffer = NULL; \
|
||||
(req)->parent_request = NULL; \
|
||||
(req)->internal = false; \
|
||||
(req)->outstanding_requests = 0; \
|
||||
opal_free_list_return (&mca_osc_rdma_component.requests, \
|
||||
(opal_free_list_item_t *) (req)); \
|
||||
} while (0)
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user