osc/rdma: fix bug in the active message code that could cause erroneous
results. The code to handle completion messages did not correctly increment the number of expected messages. This could cause wait to return before all incoming messages are complete. I also added a check to ensure that start returns an error if we are in a passive access epoch. cmr=v1.8:reviewer=jsquyres. This commit was SVN r31203.
Этот коммит содержится в:
родитель
be3fc7bf20
Коммит
bc55276844
@ -152,13 +152,16 @@ ompi_osc_rdma_start(ompi_group_t *group,
|
||||
int assert,
|
||||
ompi_win_t *win)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
ompi_osc_rdma_module_t *module = GET_MODULE(win);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"ompi_osc_rdma_start entering..."));
|
||||
|
||||
if (module->sc_group) {
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
/* ensure we're not already in a start */
|
||||
if (NULL != module->sc_group || module->passive_target_access_epoch) {
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_ERR_RMA_SYNC;
|
||||
}
|
||||
|
||||
@ -166,13 +169,6 @@ ompi_osc_rdma_start(ompi_group_t *group,
|
||||
OBJ_RETAIN(group);
|
||||
ompi_group_increment_proc_count(group);
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
/* ensure we're not already in a start */
|
||||
if (NULL != module->sc_group) {
|
||||
ret = OMPI_ERR_RMA_SYNC;
|
||||
goto cleanup;
|
||||
}
|
||||
module->sc_group = group;
|
||||
|
||||
/* disable eager sends until we've receved the proper number of
|
||||
@ -199,13 +195,6 @@ ompi_osc_rdma_start(ompi_group_t *group,
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
return OMPI_SUCCESS;
|
||||
|
||||
cleanup:
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
ompi_group_decrement_proc_count(group);
|
||||
OBJ_RELEASE(group);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1292,8 +1292,19 @@ static inline int process_complete (ompi_osc_rdma_module_t *module, int source,
|
||||
{
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"osc rdma: process_complete got complete message from %d", source));
|
||||
|
||||
OPAL_THREAD_LOCK(&module->lock);
|
||||
|
||||
/* the current fragment is not part of the frag_count so we need to adjust for it */
|
||||
module->active_incoming_frag_signal_count += complete_header->frag_count + 1;
|
||||
module->num_complete_msgs++;
|
||||
|
||||
if (0 == module->num_complete_msgs) {
|
||||
opal_condition_broadcast (&module->cond);
|
||||
}
|
||||
|
||||
OPAL_THREAD_UNLOCK(&module->lock);
|
||||
|
||||
return sizeof (*complete_header);
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user