This commit fixes the following bugs:

 - Allow a btl to be used for communication if it can communicate with
   all non-self peers and it supports global atomic visibility. In
   this case CPU atomics can be used for self and the btl for any
   other peer.

 - It was possible to get into a state where different threads of an
   MPI process could issue conflicting accumulate operations to a
   remote peer. To eliminate this race we now update the peer flags
   atomically.

 - Queue up and re-issue put operations that failed during a BTL
   callback. This can occur during an accumulate operation. This was
   an unhandled error case.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
This commit is contained in:
Nathan Hjelm 2017-11-20 12:21:50 -07:00 committed by Nathan Hjelm
parent 67e26b6e5a
commit 45db3637af
5 changed files with 145 additions and 21 deletions

View file

@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@@ -18,6 +18,90 @@
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
enum ompi_osc_rdma_event_type_t {
OMPI_OSC_RDMA_EVENT_TYPE_PUT,
};
typedef enum ompi_osc_rdma_event_type_t ompi_osc_rdma_event_type_t;
struct ompi_osc_rdma_event_t {
opal_event_t super;
ompi_osc_rdma_module_t *module;
struct mca_btl_base_endpoint_t *endpoint;
void *local_address;
mca_btl_base_registration_handle_t *local_handle;
uint64_t remote_address;
mca_btl_base_registration_handle_t *remote_handle;
uint64_t length;
mca_btl_base_rdma_completion_fn_t cbfunc;
void *cbcontext;
void *cbdata;
};
typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t;
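/* progress-loop callback for a queued put: re-issue btl_put and free the event on
 * success, otherwise re-activate the event so the operation is retried later */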
static void *ompi_osc_rdma_event_put (int fd, int flags, void *context)
{
ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context;
int ret;
ret = event->module->selected_btl->btl_put (event->module->selected_btl, event->endpoint, event->local_address,
event->remote_address, event->local_handle, event->remote_handle,
event->length, 0, MCA_BTL_NO_ORDER, event->cbfunc, event->cbcontext,
event->cbdata);
if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
/* done with this event */
opal_event_del (&event->super);
free (event);
} else {
/* re-activate the event */
opal_event_active (&event->super, OPAL_EV_READ, 1);
}
return NULL;
}
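/* allocate an event describing an RDMA operation that could not be started and
 * activate it so the operation is re-issued from the progress loop. only put
 * events are currently supported. */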
static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
ompi_osc_rdma_event_type_t event_type, void *local_address, mca_btl_base_registration_handle_t *local_handle,
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
uint64_t length, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
void *cbdata)
{
ompi_osc_rdma_event_t *event = malloc (sizeof (*event));
void *(*event_func) (int, int, void *);
if (OPAL_UNLIKELY(NULL == event)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
event->module = module;
event->endpoint = endpoint;
event->local_address = local_address;
event->local_handle = local_handle;
event->remote_address = remote_address;
event->remote_handle = remote_handle;
event->length = length;
event->cbfunc = cbfunc;
event->cbcontext = cbcontext;
event->cbdata = cbdata;
switch (event_type) {
case OMPI_OSC_RDMA_EVENT_TYPE_PUT:
event_func = ompi_osc_rdma_event_put;
break;
default:
opal_output(0, "osc/rdma: cannot queue unknown event type %d", event_type);
abort ();
}
opal_event_set (opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
event_func, event);
opal_event_active (&event->super, OPAL_EV_READ, 1);
return OMPI_SUCCESS;
}
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
@@ -113,7 +197,7 @@ static void ompi_osc_rdma_acc_put_complete (struct mca_btl_base_module_t *btl, s
}
ompi_osc_rdma_sync_rdma_dec (sync);
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING);
}
/* completion of an accumulate get operation */
@@ -171,7 +255,12 @@ static void ompi_osc_rdma_acc_get_complete (struct mca_btl_base_module_t *btl, s
(mca_btl_base_registration_handle_t *) request->ctx,
request->len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete,
request, NULL);
/* TODO -- we can do better. probably should queue up the next step and handle it in progress */
if (OPAL_SUCCESS != status) {
status = ompi_osc_rdma_event_queue (module, endpoint, OMPI_OSC_RDMA_EVENT_TYPE_PUT, (void *) source, local_handle,
request->target_address, (mca_btl_base_registration_handle_t *) request->ctx,
request->len, ompi_osc_rdma_acc_put_complete, request, NULL);
}
assert (OPAL_SUCCESS == status);
}
@@ -203,13 +292,12 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
OPAL_THREAD_LOCK(&module->lock);
/* to ensure order wait until the previous accumulate completes */
while (ompi_osc_rdma_peer_is_accumulating (peer)) {
while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) {
OPAL_THREAD_UNLOCK(&module->lock);
ompi_osc_rdma_progress (module);
OPAL_THREAD_LOCK(&module->lock);
}
peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
OPAL_THREAD_UNLOCK(&module->lock);
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
@@ -847,11 +935,12 @@ static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, s
ompi_osc_rdma_acc_put_complete, request, NULL);
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code "
"%d", ret);
}
"%d. queuing operation...", ret);
/* TODO -- we can do better. probably should queue up the next step and handle it in progress */
assert (OPAL_SUCCESS == ret);
ret = ompi_osc_rdma_event_queue (module, peer->data_endpoint, OMPI_OSC_RDMA_EVENT_TYPE_PUT, local_address, local_handle,
request->target_address, (mca_btl_base_registration_handle_t *) request->ctx, request->len,
ompi_osc_rdma_acc_put_complete, request, NULL);
}
return;
}
@@ -868,7 +957,7 @@ static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, s
ompi_osc_rdma_request_complete (request, status);
ompi_osc_rdma_sync_rdma_dec (sync);
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING);
}
static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, void *result_addr,
@@ -894,12 +983,11 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
OPAL_THREAD_LOCK(&module->lock);
/* to ensure order wait until the previous accumulate completes */
while (ompi_osc_rdma_peer_is_accumulating (peer)) {
while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) {
OPAL_THREAD_UNLOCK(&module->lock);
ompi_osc_rdma_progress (module);
OPAL_THREAD_LOCK(&module->lock);
}
peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
OPAL_THREAD_UNLOCK(&module->lock);
offset = target_address & btl_alignment_mask;;

View file

@@ -160,7 +160,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
subreq = NULL;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "scheduling rdma on non-contiguous datatype(s)");
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "scheduling rdma on non-contiguous datatype(s) or large region");
/* prepare convertors for the source and target. these convertors will be used to determine the
* contiguous segments within the source and target. */

View file

@@ -850,11 +850,18 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
}
for (int i = 0 ; i < max_btls ; ++i) {
int btl_count = btl_counts[i];
if (NULL == possible_btls[i]) {
break;
}
if (btl_counts[i] == comm_size && possible_btls[i]->btl_latency < selected_latency) {
if (possible_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) {
/* do not need to use the btl for self communication */
btl_count++;
}
if (btl_count >= comm_size && possible_btls[i]->btl_latency < selected_latency) {
selected_btl = possible_btls[i];
selected_latency = possible_btls[i]->btl_latency;
}
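The effect of this hunk, condensed into a single predicate (editorial sketch using the loop variables above, where btl stands for possible_btls[i]): a btl that reaches the comm_size - 1 non-self peers is now usable when it also advertises global atomic visibility, because the self peer is then served with CPU atomics instead of the btl.
    /* editorial sketch, not part of the commit */
    int effective_count = btl_count + ((btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) ? 1 : 0);
    bool usable = (effective_count >= comm_size) && (btl->btl_latency < selected_latency);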

View file

@@ -61,7 +61,8 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id,
*peer_out = NULL;
endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_id);
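/* a NULL endpoint is tolerated only for the self peer when the btl guarantees
 * global atomic visibility; in that case CPU atomics are used for self */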
if (OPAL_UNLIKELY(NULL == endpoint)) {
if (OPAL_UNLIKELY(NULL == endpoint && !((module->selected_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) &&
peer_id == ompi_comm_rank (module->comm)))) {
return OMPI_ERR_UNREACH;
}

View file

@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
@@ -40,7 +40,7 @@ struct ompi_osc_rdma_peer_t {
int rank;
/** peer flags */
int flags;
volatile int32_t flags;
/** aggregation support */
ompi_osc_rdma_aggregation_t *aggregate;
@@ -188,13 +188,41 @@ static inline bool ompi_osc_rdma_peer_is_exclusive (ompi_osc_rdma_peer_t *peer)
}
/**
* @brief check if this process is currently accumulating on a peer
* @brief try to set a flag on a peer object
*
* @param[in] peer peer object to check
* @param[in] peer peer object to modify
* @param[in] flag flag to set
*
* @returns true if the flag was not already set
* @returns false otherwise
*/
static inline bool ompi_osc_rdma_peer_is_accumulating (ompi_osc_rdma_peer_t *peer)
static inline bool ompi_osc_rdma_peer_test_set_flag (ompi_osc_rdma_peer_t *peer, int flag)
{
return !!(peer->flags & OMPI_OSC_RDMA_PEER_ACCUMULATING);
int32_t flags;
opal_atomic_mb ();
do {
flags = peer->flags;
if (flags & flag) {
return false;
}
} while (!OPAL_THREAD_BOOL_CMPSET_32 (&peer->flags, flags, flags | flag));
return true;
}
/**
* @brief clear a flag from a peer object
*
* @param[in] peer peer object to modify
* @param[in] flag flag to clear
*/
static inline void ompi_osc_rdma_peer_clear_flag (ompi_osc_rdma_peer_t *peer, int flag)
{
OPAL_ATOMIC_AND32(&peer->flags, ~flag);
opal_atomic_mb ();
}
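For reference, a condensed sketch of how the accumulate paths earlier in this diff use these helpers (editorial illustration; the locking around the wait loop is omitted here):
    /* claim the peer before starting an accumulate; spin in progress until any
     * previous accumulate has released it */
    while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) {
        ompi_osc_rdma_progress (module);
    }
    /* ... issue the get/op/put chain for the accumulate ... */
    /* the final completion callback releases the peer */
    ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING);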
/**