
osc/rdma: check for more types of window access violations

This commit adds a check to see if the target is in an access epoch. If
not, we return OMPI_ERR_RMA_SYNC. This fixes test_start3 in the onesided
test suite. The cost of this extra check is 1 byte/peer for the boolean
flag indicating that the peer is in an access epoch.
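
The check itself is cheap: one boolean per peer plus one window-wide flag, combined
with a logical OR. Below is a minimal sketch of the pattern using simplified
stand-ins for the structures in osc_rdma.h (the error constant value is a
placeholder, not the real OMPI definition):

/* Sketch only: trimmed-down stand-ins for the real structures in osc_rdma.h.
 * The error constant value below is a placeholder, not the real OMPI code. */
#include <stdbool.h>

#define OMPI_SUCCESS        0
#define OMPI_ERR_RMA_SYNC (-17)

typedef struct peer_t {
    bool access_epoch;          /* set by start/lock, cleared by complete/unlock */
} peer_t;

typedef struct module_t {
    bool    all_access_epoch;   /* fence or lock_all covers every peer */
    peer_t *peers;              /* one entry per rank in the window */
} module_t;

/* Mirrors ompi_osc_rdma_check_access_epoch(): a target may be accessed if the
 * whole window is in an access epoch or this particular peer is. */
static inline bool check_access_epoch (module_t *module, int rank)
{
    return module->all_access_epoch || module->peers[rank].access_epoch;
}

/* Each communication entry point (put, get, accumulate, ...) starts with the
 * same guard and bails out with an RMA synchronization error otherwise. */
static int example_put (module_t *module, int target)
{
    if (!check_access_epoch (module, target)) {
        return OMPI_ERR_RMA_SYNC;
    }
    /* ... issue the actual RDMA operation ... */
    return OMPI_SUCCESS;
}

In the hunks below the real guard sits at the top of put, accumulate,
compare-and-swap, get, and get-accumulate.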

I also fixed a problem where multiple unexpected post messages were not
handled correctly.
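
A post that arrives before its matching start is queued and consumed later. The
following is a rough, array-based sketch of that queue-and-consume idea; the real
code keeps an opal_list_t of ompi_osc_rdma_pending_post_t items, and the names and
counting scheme here are simplified:

/* Sketch only: a plain array stands in for the opal_list_t of pending posts. */
#include <stdbool.h>

#define MAX_PENDING 16

typedef struct win_state_t {
    int  pending_ranks[MAX_PENDING]; /* posts that arrived before the matching start */
    int  num_pending;
    int  num_post_msgs;              /* net count of post messages for the open epoch */
    bool start_active;               /* a start epoch is currently open */
} win_state_t;

/* Receive side: if the post does not match an open start epoch, queue it. */
static void incoming_post (win_state_t *w, int source, bool source_in_start_group)
{
    if (!w->start_active || !source_in_start_group) {
        if (w->num_pending < MAX_PENDING) {
            w->pending_ranks[w->num_pending++] = source;  /* remember the early post */
        }
        return;
    }
    ++w->num_post_msgs;              /* post matches the open epoch: count it */
}

/* Start side: walk the queue and account for every post whose sender is in
 * the new start group, keeping the rest for a later epoch. */
static void consume_pending_posts (win_state_t *w, bool (*in_group)(int rank))
{
    int kept = 0;
    for (int i = 0; i < w->num_pending; ++i) {
        if (in_group (w->pending_ranks[i])) {
            ++w->num_post_msgs;      /* already received; no need to wait for it */
        } else {
            w->pending_ranks[kept++] = w->pending_ranks[i];
        }
    }
    w->num_pending = kept;
}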

cmr=v1.8.2:reviewer=jsquyres

This commit was SVN r32160.
This commit is contained in:
Nathan Hjelm 2014-07-08 21:11:12 +00:00
parent d63cf04d2e
commit b6abe68972
4 changed files with 107 additions and 16 deletions

View file

@@ -88,6 +88,7 @@ struct ompi_osc_rdma_peer_t {
/** Number of acks pending. New requests can not be sent out if there are
* acks pending (to fulfill the ordering constraints of accumulate) */
uint32_t num_acks_pending;
bool access_epoch;
};
typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t;
@@ -166,6 +167,9 @@ struct ompi_osc_rdma_module_t {
/** start sending data eagerly */
bool active_eager_send_active;
/** Indicates the window is in an all access epoch (fence, lock_all) */
bool all_access_epoch;
bool *passive_eager_send_active;
/* ********************* PWSC data ************************ */
@@ -690,6 +694,11 @@ static inline void ompi_osc_rdma_accumulate_unlock (ompi_osc_rdma_module_t *modu
}
}
static inline bool ompi_osc_rdma_check_access_epoch (ompi_osc_rdma_module_t *module, int rank)
{
return module->all_access_epoch || module->peers[rank].access_epoch;
}
END_C_DECLS
#endif /* OMPI_OSC_RDMA_H */

View file

@@ -117,6 +117,7 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
/* active sends are now active (we will close the epoch if NOSUCCEED is specified) */
if (0 == (assert & MPI_MODE_NOSUCCEED)) {
module->active_eager_send_active = true;
module->all_access_epoch = true;
}
/* short-circuit the noprecede case */
@@ -166,7 +167,8 @@ ompi_osc_rdma_fence(int assert, ompi_win_t *win)
/* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
* stated that MPI_MODE_NOSUCCEED ends the epoch but it is a safe assumption. */
module->active_eager_send_active = false;
}
module->all_access_epoch = false;
}
cleanup:
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
@@ -185,13 +187,14 @@ ompi_osc_rdma_start(ompi_group_t *group,
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_pending_post_t *pending_post, *next;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start entering..."));
ompi_osc_rdma_peer_t *peer;
int group_size;
int *ranks;
OPAL_THREAD_LOCK(&module->lock);
/* ensure we're not already in a start */
/* ensure we're not already in a start or passive target. we can not check for all
* access here due to fence */
if (NULL != module->sc_group || module->passive_target_access_epoch) {
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_RMA_SYNC;
@@ -203,14 +206,32 @@ ompi_osc_rdma_start(ompi_group_t *group,
module->sc_group = group;
/* mark all procs in this group as being in an access epoch */
group_size = ompi_group_size (module->sc_group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start entering with group size %d...",
group_size));
ranks = get_comm_ranks(module, module->sc_group);
if (NULL == ranks) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
for (int i = 0 ; i < group_size ; ++i) {
/* when the post comes in we will be in an access epoch with this proc */
module->peers[ranks[i]].access_epoch = true;
}
free (ranks);
OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_rdma_pending_post_t) {
ompi_proc_t *pending_proc = ompi_comm_peer_lookup (module->comm, pending_post->rank);
if (group_contains_proc (module->sc_group, pending_proc)) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Consumed unexpected post message from %d",
pending_post->rank));
++module->num_post_msgs;
opal_list_remove_item (&module->pending_posts, &pending_post->super);
OBJ_RELEASE(pending_post);
break;
}
}
@@ -219,13 +240,13 @@ ompi_osc_rdma_start(ompi_group_t *group,
receive messages. */
module->active_eager_send_active = false;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"num_post_msgs = %d", module->num_post_msgs));
/* possible we've already received a couple in messages, so
add however many we're going to wait for */
module->num_post_msgs -= ompi_group_size(module->sc_group);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"num_post_msgs = %d", module->num_post_msgs));
/* if we've already received all the post messages, we can eager
send. Otherwise, eager send will be enabled when
numb_post_messages reaches 0 */
@@ -246,10 +267,12 @@ ompi_osc_rdma_complete(ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_complete_t complete_req;
ompi_osc_rdma_peer_t *peer;
int ret = OMPI_SUCCESS;
int i;
int *ranks = NULL;
ompi_group_t *group;
int my_rank = ompi_comm_rank (module->comm);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_complete entering..."));
@@ -282,10 +305,21 @@ ompi_osc_rdma_complete(ompi_win_t *win)
round. */
OPAL_THREAD_UNLOCK(&module->lock);
for (i = 0 ; i < ompi_group_size(module->sc_group) ; ++i) {
if (my_rank == ranks[i]) {
/* shortcut for self */
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self complete"));
module->num_complete_msgs++;
continue;
}
complete_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_COMPLETE;
complete_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
complete_req.frag_count = module->epoch_outgoing_frag_count[ranks[i]];
peer = module->peers + ranks[i];
peer->access_epoch = false;
ret = ompi_osc_rdma_control_send(module,
ranks[i],
&complete_req,
@@ -344,14 +378,17 @@ ompi_osc_rdma_post(ompi_group_t *group,
int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_header_post_t post_req;
int my_rank = ompi_comm_rank(module->comm);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_post entering..."));
/* can't check for all access epoch here due to fence */
if (module->pw_group) {
return OMPI_ERR_RMA_SYNC;
}
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_post entering with group size %d...",
ompi_group_size (group)));
/* save the group */
OBJ_RETAIN(group);
ompi_group_increment_proc_count(group);
@@ -382,6 +419,15 @@ ompi_osc_rdma_post(ompi_group_t *group,
/* send a hello counter to everyone in group */
for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "Sending post message to rank %d", ranks[i]));
/* shortcut for self */
if (my_rank == ranks[i]) {
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_complete self post"));
osc_rdma_incoming_post (module, my_rank);
continue;
}
post_req.base.type = OMPI_OSC_RDMA_HDR_TYPE_POST;
post_req.base.flags = OMPI_OSC_RDMA_HDR_FLAG_VALID;
post_req.windx = ompi_comm_get_cid(module->comm);
@@ -407,16 +453,19 @@ ompi_osc_rdma_wait(ompi_win_t *win)
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_group_t *group;
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait entering..."));
if (NULL == module->pw_group) {
return OMPI_ERR_RMA_SYNC;
}
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_wait entering..."));
OPAL_THREAD_LOCK(&module->lock);
while (0 != module->num_complete_msgs ||
module->active_incoming_frag_count < module->active_incoming_frag_signal_count) {
OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output,
"num_complete_msgs = %d, active_incoming_frag_count = %d, active_incoming_frag_signal_count = %d",
module->num_complete_msgs, module->active_incoming_frag_count, module->active_incoming_frag_signal_count));
opal_condition_wait(&module->cond, &module->lock);
}
@@ -487,6 +536,10 @@ int osc_rdma_incoming_post (ompi_osc_rdma_module_t *module, int source)
if (!module->sc_group || !group_contains_proc (module->sc_group, source_proc)) {
ompi_osc_rdma_pending_post_t *pending_post = OBJ_NEW(ompi_osc_rdma_pending_post_t);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"received unexpected post message from %d. module->sc_group = %p, size = %d",
source, module->sc_group, module->sc_group ? ompi_group_size (module->sc_group) : 0));
pending_post->rank = source;
opal_list_append (&module->pending_posts, &pending_post->super);

View file

@@ -301,6 +301,10 @@ static inline int ompi_osc_rdma_put_w_req (void *origin_addr, int origin_count,
origin_dt->name, target, (int) target_disp,
target_count, target_dt->name, win->w_name));
if (!ompi_osc_rdma_check_access_epoch (module, target)) {
return OMPI_ERR_RMA_SYNC;
}
/* short-circuit case */
if (0 == origin_count || 0 == target_count) {
if (request) {
@@ -473,6 +477,10 @@ ompi_osc_rdma_accumulate_w_req (void *origin_addr, int origin_count,
target_count, target_dt->name, op->o_name,
win->w_name));
if (!ompi_osc_rdma_check_access_epoch (module, target)) {
return OMPI_ERR_RMA_SYNC;
}
/* short-circuit case */
if (0 == origin_count || 0 == target_count) {
if (request) {
@@ -643,6 +651,10 @@ int ompi_osc_rdma_compare_and_swap (void *origin_addr, void *compare_addr,
(unsigned long) result_addr, dt->name, target, (int) target_disp,
win->w_name));
if (!ompi_osc_rdma_check_access_epoch (module, target)) {
return OMPI_ERR_RMA_SYNC;
}
/* optimize self case. TODO: optimize local case */
if (ompi_comm_rank (module->comm) == target) {
return ompi_osc_rdma_cas_self (origin_addr, compare_addr, result_addr, dt, target_disp,
@@ -788,6 +800,10 @@ static inline int ompi_osc_rdma_rget_internal (void *origin_addr, int origin_cou
origin_dt->name, target, (int) target_disp,
target_count, target_dt->name, win->w_name));
if (!ompi_osc_rdma_check_access_epoch (module, target)) {
return OMPI_ERR_RMA_SYNC;
}
/* gets are always request based, so that we know where to land the data */
OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
if (NULL == rdma_request) {
@@ -997,6 +1013,10 @@ int ompi_osc_rdma_rget_accumulate_internal (void *origin_addr, int origin_count,
target_rank, (int) target_disp, target_count, target_datatype->name,
op->o_name, win->w_name));
if (!ompi_osc_rdma_check_access_epoch (module, target_rank)) {
return OMPI_ERR_RMA_SYNC;
}
/* get_accumulates are always request based, so that we know where to land the data */
OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
if (OPAL_UNLIKELY(NULL == rdma_request)) {

View file

@@ -185,11 +185,12 @@ int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_outstanding_lock_t *lock;
ompi_osc_rdma_peer_t *peer = module->peers + target;
int ret = OMPI_SUCCESS;
/* Check if no_locks is set. TODO: we also need to track whether we are in an
* active target epoch. Fence can make this tricky to track. */
if (NULL == module->passive_eager_send_active) {
if (NULL == module->passive_eager_send_active || module->sc_group) {
return OMPI_ERR_RMA_SYNC;
}
@@ -203,6 +204,9 @@ int ompi_osc_rdma_lock(int lock_type, int target, int assert, ompi_win_t *win)
module->passive_eager_send_active[target] = false;
module->passive_target_access_epoch = true;
/* when the lock ack returns we will be in an access epoch with this peer */
peer->access_epoch = true;
/* create lock item */
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
if (OPAL_UNLIKELY(NULL == lock)) {
@@ -249,6 +253,7 @@ int ompi_osc_rdma_unlock(int target, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_outstanding_lock_t *lock = NULL;
ompi_osc_rdma_peer_t *peer = module->peers + target;
int ret = OMPI_SUCCESS;
OPAL_THREAD_LOCK(&module->lock);
@@ -299,6 +304,8 @@ int ompi_osc_rdma_unlock(int target, ompi_win_t *win)
module->epoch_outgoing_frag_count[target] = 0;
module->passive_target_access_epoch = false;
peer->access_epoch = false;
/* delete the lock */
opal_list_remove_item (&module->outstanding_locks, &lock->super);
OBJ_RELEASE(lock);
@@ -328,6 +335,7 @@ int ompi_osc_rdma_lock_all(int assert, struct ompi_win_t *win)
module->passive_eager_send_active[i] = false;
}
module->passive_target_access_epoch = true;
module->all_access_epoch = true;
/* create lock item */
lock = OBJ_NEW(ompi_osc_rdma_outstanding_lock_t);
@@ -434,6 +442,7 @@ int ompi_osc_rdma_unlock_all (struct ompi_win_t *win)
OBJ_RELEASE(lock);
module->passive_target_access_epoch = false;
module->all_access_epoch = false;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_unlock_all complete"));