1
1

osc/rdma: rework locking code to improve behavior of unlock

This commit changes the locking code to allow the lock release to be
non-blocking. This helps with releasing the accumulate lock which may
occur in a BTL callback.

Fixes #3616

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2017-06-21 12:42:50 -06:00 коммит произвёл Nathan Hjelm
родитель 055ce802cd
Коммит 022c658bbf
3 изменённых файлов: 136 добавлений и 130 удалений

Просмотреть файл

@ -48,6 +48,27 @@ typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;
static OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);
static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pending_op)
{
pending_op->op_frag = NULL;
pending_op->op_buffer = NULL;
pending_op->op_result = NULL;
pending_op->op_complete = false;
}
static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op)
{
if (NULL != pending_op->op_frag) {
ompi_osc_rdma_frag_complete (pending_op->op_frag);
}
ompi_osc_rdma_pending_op_construct (pending_op);
}
OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t,
ompi_osc_rdma_pending_op_construct,
ompi_osc_rdma_pending_op_destruct);
/**
* Dummy completion function for atomic operations
*/
@ -55,11 +76,19 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *data, int status)
{
volatile bool *atomic_complete = (volatile bool *) context;
ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context;
if (atomic_complete) {
*atomic_complete = true;
if (pending_op->op_result) {
memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size);
}
if (NULL != pending_op->op_frag) {
ompi_osc_rdma_frag_complete (pending_op->op_frag);
pending_op->op_frag = NULL;
}
pending_op->op_complete = true;
OBJ_RELEASE(pending_op);
}
/**
@ -182,9 +211,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
ompi_osc_rdma_peer_t **peers;
int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_state_t *state = module->state;
volatile bool atomic_complete;
ompi_osc_rdma_frag_t *frag = NULL;
osc_rdma_counter_t *temp = NULL;
int ret;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post: %p, %d, %s", (void*) group, assert, win->w_name);
@ -212,9 +238,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
state->num_complete_msgs = 0;
OPAL_THREAD_UNLOCK(&module->lock);
/* allocate a temporary buffer for atomic response */
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
if ((assert & MPI_MODE_NOCHECK) || 0 == ompi_group_size (group)) {
return OMPI_SUCCESS;
}
@ -226,7 +249,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
/* translate group ranks into the communicator */
peers = ompi_osc_rdma_get_peers (module, module->pw_group);
if (OPAL_UNLIKELY(NULL == peers)) {
ompi_osc_rdma_frag_complete (frag);
return OMPI_ERR_OUT_OF_RESOURCE;
}
@ -236,7 +258,7 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) {
ompi_osc_rdma_peer_t *peer = peers[i];
uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index);
int post_index;
ompi_osc_rdma_lock_t post_index;
if (peer->rank == my_rank) {
ompi_osc_rdma_handle_post (module, my_rank, NULL, 0);
@ -244,57 +266,32 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
}
/* get a post index */
atomic_complete = false;
if (!ompi_osc_rdma_peer_local_state (peer)) {
do {
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, target, frag->handle,
peer->state_handle, MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
assert (OPAL_SUCCESS >= ret);
if (OMPI_SUCCESS == ret) {
while (!atomic_complete) {
ompi_osc_rdma_progress (module);
}
break;
}
ompi_osc_rdma_progress (module);
} while (1);
ret = ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true);
assert (OMPI_SUCCESS == ret);
} else {
*temp = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
}
post_index = (*temp) & (OMPI_OSC_RDMA_POST_PEER_MAX - 1);
post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1;
target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) +
sizeof (osc_rdma_counter_t) * post_index;
do {
ompi_osc_rdma_lock_t result;
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", post_index, peer->rank);
/* try to post. if the value isn't 0 then another rank is occupying this index */
if (!ompi_osc_rdma_peer_local_state (peer)) {
atomic_complete = false;
ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, target, frag->handle, peer->state_handle,
0, 1 + (int64_t) my_rank, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
(void *) &atomic_complete, NULL);
assert (OPAL_SUCCESS >= ret);
if (OMPI_SUCCESS == ret) {
while (!atomic_complete) {
ompi_osc_rdma_progress (module);
}
} else {
ompi_osc_rdma_progress (module);
continue;
}
ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result);
assert (OMPI_SUCCESS == ret);
} else {
*temp = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
result = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
}
if (OPAL_LIKELY(0 == *temp)) {
if (OPAL_LIKELY(0 == result)) {
break;
}
@ -313,8 +310,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
} while (1);
}
ompi_osc_rdma_frag_complete (frag);
ompi_osc_rdma_release_peers (peers, ompi_group_size(module->pw_group));
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post complete");
@ -422,9 +417,7 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_sync_t *sync = &module->all_sync;
ompi_osc_rdma_frag_t *frag = NULL;
ompi_osc_rdma_peer_t **peers;
void *scratch_lock = NULL;
ompi_group_t *group;
int group_size, ret;
@ -459,45 +452,19 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
ompi_osc_rdma_sync_rdma_complete (sync);
if (!(MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags)) {
/* need a temporary buffer for performing fetching atomics */
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &scratch_lock);
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
return ret;
}
}
/* for each process in the group increment their number of complete messages */
for (int i = 0 ; i < group_size ; ++i) {
ompi_osc_rdma_peer_t *peer = peers[i];
intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_complete_msgs);
if (!ompi_osc_rdma_peer_local_state (peer)) {
do {
if (MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags) {
ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, target, peer->state_handle,
MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_atomic_complete, NULL, NULL);
} else {
/* don't care about the read value so use the scratch lock */
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, scratch_lock,
target, frag->handle, peer->state_handle, MCA_BTL_ATOMIC_ADD, 1,
0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, NULL, NULL);
}
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
break;
}
} while (1);
ret = ompi_osc_rdma_lock_btl_op (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, true);
assert (OMPI_SUCCESS == ret);
} else {
(void) ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) target, 1);
}
}
if (frag) {
ompi_osc_rdma_frag_complete (frag);
}
/* release our reference to peers in this group */
ompi_osc_rdma_release_peers (peers, group_size);

Просмотреть файл

@ -34,23 +34,34 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b
__opal_attribute_always_inline__
static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address,
int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result)
int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result,
const bool wait_for_completion)
{
volatile bool atomic_complete = false;
ompi_osc_rdma_frag_t *frag = NULL;
ompi_osc_rdma_lock_t *temp = NULL;
ompi_osc_rdma_pending_op_t *pending_op;
int ret;
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
assert (NULL != pending_op);
if (wait_for_completion) {
OBJ_RETAIN(pending_op);
}
pending_op->op_result = (void *) result;
pending_op->op_size = sizeof (ompi_osc_rdma_lock_t);
OBJ_RETAIN(pending_op);
/* spin until the btl has accepted the operation */
do {
if (NULL == frag) {
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
if (NULL == pending_op->op_frag) {
ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer);
}
if (NULL != frag) {
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, (intptr_t) address,
frag->handle, peer->state_handle, op, operand, 0,
MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, (void *) &atomic_complete,
NULL);
if (NULL != pending_op->op_frag) {
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, pending_op->op_buffer,
(intptr_t) address, pending_op->op_frag->handle, peer->state_handle,
op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
(void *) pending_op, NULL);
}
if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) {
@ -59,40 +70,43 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om
ompi_osc_rdma_progress (module);
} while (1);
if (OPAL_SUCCESS == ret) {
while (!atomic_complete) {
if (OPAL_SUCCESS != ret) {
/* need to release here because ompi_osc_rdma_atomic_complet was not called */
OBJ_RELEASE(pending_op);
if (OPAL_LIKELY(1 == ret)) {
ret = OMPI_SUCCESS;
}
} else if (wait_for_completion) {
while (!pending_op->op_complete) {
ompi_osc_rdma_progress (module);
}
} else if (1 == ret) {
ret = OMPI_SUCCESS;
}
if (NULL != frag) {
if (result) {
*result = *temp;
}
ompi_osc_rdma_frag_complete (frag);
}
OBJ_RELEASE(pending_op);
return ret;
}
__opal_attribute_always_inline__
static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address,
int op, ompi_osc_rdma_lock_t operand)
int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion)
{
volatile bool atomic_complete = false;
ompi_osc_rdma_pending_op_t *pending_op;
int ret;
if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
return ompi_osc_rdma_lock_btl_fop (module, peer, address, op, operand, NULL);
return ompi_osc_rdma_lock_btl_fop (module, peer, address, op, operand, NULL, wait_for_completion);
}
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
assert (NULL != pending_op);
OBJ_RETAIN(pending_op);
/* spin until the btl has accepted the operation */
do {
ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, (intptr_t) address, peer->state_handle,
op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
(void *) &atomic_complete, NULL);
(void *) pending_op, NULL);
if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) {
break;
@ -100,14 +114,20 @@ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, omp
ompi_osc_rdma_progress (module);
} while (1);
if (OPAL_SUCCESS == ret) {
while (!atomic_complete) {
if (OPAL_SUCCESS != ret) {
/* need to release here because ompi_osc_rdma_atomic_complet was not called */
OBJ_RELEASE(pending_op);
if (OPAL_LIKELY(1 == ret)) {
ret = OMPI_SUCCESS;
}
} else if (wait_for_completion) {
while (!pending_op->op_complete) {
ompi_osc_rdma_progress (module);
}
} else if (1 == ret) {
ret = OMPI_SUCCESS;
}
OBJ_RELEASE(pending_op);
return ret;
}
@ -115,20 +135,26 @@ __opal_attribute_always_inline__
static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address,
ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result)
{
volatile bool atomic_complete = false;
ompi_osc_rdma_frag_t *frag = NULL;
ompi_osc_rdma_lock_t *temp = NULL;
ompi_osc_rdma_pending_op_t *pending_op;
int ret;
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
assert (NULL != pending_op);
OBJ_RETAIN(pending_op);
pending_op->op_result = (void *) result;
pending_op->op_size = sizeof (*result);
/* spin until the btl has accepted the operation */
do {
if (NULL == frag) {
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);
if (NULL == pending_op->op_frag) {
ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer);
}
if (NULL != frag) {
ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, address, frag->handle,
peer->state_handle, compare, value, 0, 0, ompi_osc_rdma_atomic_complete,
(void *) &atomic_complete, NULL);
if (NULL != pending_op->op_frag) {
ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, pending_op->op_buffer,
address, pending_op->op_frag->handle, peer->state_handle, compare,
value, 0, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL);
}
if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) {
@ -137,20 +163,19 @@ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module,
ompi_osc_rdma_progress (module);
} while (1);
if (OPAL_SUCCESS == ret) {
while (!atomic_complete) {
if (OPAL_SUCCESS != ret) {
/* need to release here because ompi_osc_rdma_atomic_complet was not called */
OBJ_RELEASE(pending_op);
if (OPAL_LIKELY(1 == ret)) {
ret = OMPI_SUCCESS;
}
} else {
while (!pending_op->op_complete) {
ompi_osc_rdma_progress (module);
}
} else if (1 == ret) {
ret = OMPI_SUCCESS;
}
if (NULL != frag) {
if (*result) {
*result = *temp;
}
ompi_osc_rdma_frag_complete (frag);
}
OBJ_RELEASE(pending_op);
return ret;
}
@ -178,7 +203,7 @@ static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *mod
peer->rank, (unsigned long) value);
if (!ompi_osc_rdma_peer_local_state (peer)) {
return ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, value);
return ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, value, false);
}
(void) ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, value);
@ -215,7 +240,7 @@ static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *mod
/* spin until the lock has been acquired */
if (!ompi_osc_rdma_peer_local_state (peer)) {
do {
ret = ompi_osc_rdma_lock_btl_fop (module, peer, lock, MCA_BTL_ATOMIC_ADD, value, &lock_state);
ret = ompi_osc_rdma_lock_btl_fop (module, peer, lock, MCA_BTL_ATOMIC_ADD, value, &lock_state, true);
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to increment shared lock. opal error code %d", ret);
return ret;
@ -339,7 +364,8 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t *
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank);
if (!ompi_osc_rdma_peer_local_state (peer)) {
ret = ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE);
ret = ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE,
false);
} else {
ompi_osc_rdma_unlock_local ((volatile ompi_osc_rdma_lock_t *)(intptr_t) lock);
}

Просмотреть файл

@ -205,6 +205,19 @@ typedef struct ompi_osc_rdma_aggregation_t ompi_osc_rdma_aggregation_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_aggregation_t);
struct ompi_osc_rdma_pending_op_t {
opal_list_item_t super;
struct ompi_osc_rdma_frag_t *op_frag;
void *op_buffer;
void *op_result;
size_t op_size;
volatile bool op_complete;
};
typedef struct ompi_osc_rdma_pending_op_t ompi_osc_rdma_pending_op_t;
OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_op_t);
#define OSC_RDMA_VERBOSE(x, ...) OPAL_OUTPUT_VERBOSE((x, ompi_osc_base_framework.framework_output, __VA_ARGS__))
#endif /* OMPI_OSC_RDMA_TYPES_H */