1
1

osc/rdma: fix possible error when encountering accumulate lock contention

It is possible to get into a situation where a small accumulate operation
can not be completed because a large accumulate operation holds the lock.
In this case we may return from wait/flush/etc before the operation is
complete. To handle this case increment the expected incoming fragment
count when queuing an accumulate operation and increment the incoming
fragment count after processing the accumulate operation.

cmr=v1.8:reviewer=jsquyres

This commit was SVN r31224.
Этот коммит содержится в:
Nathan Hjelm 2014-03-25 21:00:43 +00:00
родитель 3df85b47e9
Коммит 0d703759f6
2 изменённых файлов: 15 добавлений и 0 удалений

Просмотреть файл

@ -216,6 +216,7 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_t);
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module)
extern bool ompi_osc_rdma_no_locks;
int ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len);
int ompi_osc_rdma_detach(struct ompi_win_t *win, void *base);

Просмотреть файл

@ -708,11 +708,20 @@ static int ompi_osc_rdma_acc_op_queue (ompi_osc_rdma_module_t *module, ompi_osc_
{
osc_rdma_pending_acc_t *pending_acc;
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"%d: queuing accumulate operation", ompi_comm_size (module->comm)));
pending_acc = OBJ_NEW(osc_rdma_pending_acc_t);
if (OPAL_UNLIKELY(NULL == pending_acc)) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
if (!ompi_osc_rdma_no_locks) {
/* NTH: ensure we don't leave wait/process_flush/etc until this
* accumulate operation is complete. */
module->passive_incoming_frag_signal_count[source]++;
}
pending_acc->source = source;
/* save any inline data (eager acc, gacc only) */
@ -1122,6 +1131,11 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module)
assert (0);
}
if (!ompi_osc_rdma_no_locks) {
/* signal that an operation is complete */
mark_incoming_completion (module, pending_acc->source);
}
pending_acc->data = NULL;
OBJ_RELEASE(pending_acc);