1
1

osc/rdma: fix bug in the active message code that could cause erroneous results

The code to handle completion messages did not correctly increment the
number of expected messages. This could cause wait to return before all
incoming messages are complete.

I also added a check to ensure that start returns an error
(OMPI_ERR_RMA_SYNC) if we are in a passive target access epoch.

cmr=v1.8:reviewer=jsquyres

This commit was SVN r31203.
Этот коммит содержится в:
Nathan Hjelm 2014-03-25 15:28:36 +00:00
родитель be3fc7bf20
Коммит bc55276844
2 изменённых файлов: 16 добавлений и 16 удалений

Просмотреть файл

@ -152,13 +152,16 @@ ompi_osc_rdma_start(ompi_group_t *group,
int assert,
ompi_win_t *win)
{
int ret = OMPI_SUCCESS;
ompi_osc_rdma_module_t *module = GET_MODULE(win);
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"ompi_osc_rdma_start entering..."));
if (module->sc_group) {
OPAL_THREAD_LOCK(&module->lock);
/* ensure we're not already in a start */
if (NULL != module->sc_group || module->passive_target_access_epoch) {
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERR_RMA_SYNC;
}
@ -166,13 +169,6 @@ ompi_osc_rdma_start(ompi_group_t *group,
OBJ_RETAIN(group);
ompi_group_increment_proc_count(group);
OPAL_THREAD_LOCK(&module->lock);
/* ensure we're not already in a start */
if (NULL != module->sc_group) {
ret = OMPI_ERR_RMA_SYNC;
goto cleanup;
}
module->sc_group = group;
/* disable eager sends until we've receved the proper number of
@ -199,13 +195,6 @@ ompi_osc_rdma_start(ompi_group_t *group,
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_SUCCESS;
cleanup:
OPAL_THREAD_UNLOCK(&module->lock);
ompi_group_decrement_proc_count(group);
OBJ_RELEASE(group);
return ret;
}

Просмотреть файл

@ -1292,8 +1292,19 @@ static inline int process_complete (ompi_osc_rdma_module_t *module, int source,
{
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"osc rdma: process_complete got complete message from %d", source));
OPAL_THREAD_LOCK(&module->lock);
/* the current fragment is not part of the frag_count so we need to adjust for it */
module->active_incoming_frag_signal_count += complete_header->frag_count + 1;
module->num_complete_msgs++;
if (0 == module->num_complete_msgs) {
opal_condition_broadcast (&module->cond);
}
OPAL_THREAD_UNLOCK(&module->lock);
return sizeof (*complete_header);
}