1
1

Merge pull request #1320 from hjelmn/osc_rdma_fix

osc/rdma: fix hang when performing large unaligned gets
Этот коммит содержится в:
Nathan Hjelm 2016-01-25 09:36:13 -07:00
родитель ad3aa3879a 45da311473
Коммит 500e90422d
5 изменённых файлов: 102 добавлений и 21 удалений

Просмотреть файл

@ -15,6 +15,11 @@
#include "osc_rdma_dynamic.h"
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
#include "opal/align.h"
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
ompi_osc_rdma_request_t *request);
static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
@ -136,7 +141,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
ompi_osc_rdma_peer_t *peer, uint64_t remote_address,
mca_btl_base_registration_handle_t *remote_handle, int remote_count,
ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len,
const ompi_osc_rdma_fn_t rdma_fn,const bool alloc_reqs)
const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs)
{
ompi_osc_rdma_module_t *module = sync->module;
struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX];
@ -575,11 +580,13 @@ static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struc
assert (OPAL_SUCCESS == status);
if (NULL != frag) {
if (request->buffer || NULL != frag) {
if (OPAL_LIKELY(OMPI_SUCCESS == status)) {
memcpy (origin_addr, (void *) source, request->len);
}
}
if (NULL != frag) {
ompi_osc_rdma_frag_complete (frag);
} else {
ompi_osc_rdma_deregister (sync->module, local_handle);
@ -621,6 +628,27 @@ int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer)
}
static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
ompi_osc_rdma_request_t *request) {
ompi_osc_rdma_module_t *module = sync->module;
ompi_osc_rdma_request_t *subreq;
int ret;
OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
subreq->internal = true;
subreq->type = OMPI_OSC_RDMA_TYPE_RDMA;
subreq->parent_request = request;
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1);
ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
(void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1);
}
return ret;
}
static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address,
mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size,
@ -639,33 +667,81 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask;
aligned_len = aligned_source_bound - aligned_source_base;
request->offset = source_address - aligned_source_base;
request->len = size;
request->origin_addr = target_buffer;
request->sync = sync;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p",
size, source_address, target_buffer);
if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) ||
(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
/* check for alignment */
if (!(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
(void) ompi_osc_rdma_register (module, peer->data_endpoint, target_buffer, size, MCA_BTL_REG_FLAG_LOCAL_WRITE,
if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) {
/* region is too large for a buffered read */
size_t subsize;
if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) {
/* remote region has the same alignment but the base is not aligned. perform a small
* buffered get of the beginning of the remote region */
aligned_source_base = OPAL_ALIGN(source_address, module->selected_btl->btl_get_alignment, osc_rdma_base_t);
subsize = (size_t) (aligned_source_base - source_address);
ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
source_address += subsize;
target_buffer = (void *) ((intptr_t) target_buffer + subsize);
size -= subsize;
aligned_len = aligned_source_bound - aligned_source_base;
}
if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) &&
(size & btl_alignment_mask)) {
/* remote region bases are aligned but the bounds are not. perform a
* small buffered get of the end of the remote region */
aligned_len = size & ~btl_alignment_mask;
subsize = size - aligned_len;
size = aligned_len;
ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle,
(void *) ((intptr_t) target_buffer + aligned_len), subsize, request);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
}
/* (remaining) user request is now correctly aligned */
}
if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) {
/* local and remote alignments differ */
request->buffer = ptr = malloc (aligned_len);
} else {
ptr = target_buffer;
}
if (NULL != ptr) {
(void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE,
&local_handle);
}
if (OPAL_UNLIKELY(NULL == local_handle)) {
return OMPI_ERR_OUT_OF_RESOURCE;
free (request->buffer);
request->buffer = NULL;
return ret;
}
} else {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get", ptr, (void *) frag);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx",
ptr, (void *) frag, aligned_len, (unsigned long) aligned_source_base);
local_handle = frag->handle;
}
}
request->offset = source_address - aligned_source_base;
request->len = size;
request->origin_addr = target_buffer;
request->sync = sync;
ompi_osc_rdma_sync_rdma_inc (sync);
do {

Просмотреть файл

@ -60,7 +60,7 @@ static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size
request_len = OPAL_ALIGN(request_len, 8, size_t);
if (request_len > (mca_osc_rdma_component.buffer_size >> 1)) {
return OMPI_ERR_OUT_OF_RESOURCE;
return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
}
OPAL_THREAD_LOCK(&module->lock);

Просмотреть файл

@ -50,7 +50,7 @@ static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *mod
{
uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
volatile bool atomic_complete = false;
void *temp;
void *temp = NULL;
int ret;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing shared lock %" PRIx64 " on peer %d. value 0x%lx", lock,
@ -117,7 +117,7 @@ static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *mod
{
uint64_t lock = (uint64_t) peer->state + offset;
volatile bool atomic_complete;
ompi_osc_rdma_lock_t *temp;
ompi_osc_rdma_lock_t *temp = NULL;
int ret;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring shared lock %" PRIx64 " on peer %d. value 0x%lx", lock,
@ -292,7 +292,7 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t *
{
uint64_t lock = (uint64_t) (intptr_t) peer->state + offset;
volatile bool atomic_complete = false;
void *temp;
void *temp = NULL;
int ret;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank);

Просмотреть файл

@ -59,7 +59,10 @@ static void request_construct(ompi_osc_rdma_request_t *request)
request->super.req_free = request_free;
request->super.req_cancel = request_cancel;
request->super.req_complete_cb = request_complete;
request->parent_request = 0;
request->parent_request = NULL;
request->buffer = NULL;
request->internal = false;
request->outstanding_requests = 0;
OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
}

Просмотреть файл

@ -60,6 +60,7 @@ struct ompi_osc_rdma_request_t {
/** synchronization object */
struct ompi_osc_rdma_sync_t *sync;
void *buffer;
};
typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
@ -78,18 +79,19 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t);
req = (ompi_osc_rdma_request_t*) item; \
OMPI_REQUEST_INIT(&req->super, false); \
req->super.req_mpi_object.win = module->win; \
req->super.req_complete = false; \
req->super.req_state = OMPI_REQUEST_ACTIVE; \
req->module = rmodule; \
req->internal = false; \
req->outstanding_requests = 0; \
req->parent_request = NULL; \
req->peer = (rpeer); \
} while (0)
#define OMPI_OSC_RDMA_REQUEST_RETURN(req) \
do { \
OMPI_REQUEST_FINI(&(req)->super); \
free ((req)->buffer); \
(req)->buffer = NULL; \
(req)->parent_request = NULL; \
(req)->internal = false; \
(req)->outstanding_requests = 0; \
opal_free_list_return (&mca_osc_rdma_component.requests, \
(opal_free_list_item_t *) (req)); \
} while (0)