osc/rdma: fix possible error when encountering accumulate lock contention
It is possible to get into a situation where a small accumulate operation can not be completed because a large accumulate operation holds the lock. In this case we may return from wait/flush/etc before the operation is complete. To handle this case increment the expected incoming fragment count when queuing an accumulate operation and increment the incoming fragment count after processing the accumulate operation. cmr=v1.8:reviewer=jsquyres This commit was SVN r31224.
Этот коммит содержится в:
родитель
3df85b47e9
Коммит
0d703759f6
@ -216,6 +216,7 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_t);
|
||||
|
||||
#define GET_MODULE(win) ((ompi_osc_rdma_module_t*) win->w_osc_module)
|
||||
|
||||
extern bool ompi_osc_rdma_no_locks;
|
||||
|
||||
int ompi_osc_rdma_attach(struct ompi_win_t *win, void *base, size_t len);
|
||||
int ompi_osc_rdma_detach(struct ompi_win_t *win, void *base);
|
||||
|
@ -708,11 +708,20 @@ static int ompi_osc_rdma_acc_op_queue (ompi_osc_rdma_module_t *module, ompi_osc_
|
||||
{
|
||||
osc_rdma_pending_acc_t *pending_acc;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
|
||||
"%d: queuing accumulate operation", ompi_comm_size (module->comm)));
|
||||
|
||||
pending_acc = OBJ_NEW(osc_rdma_pending_acc_t);
|
||||
if (OPAL_UNLIKELY(NULL == pending_acc)) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
if (!ompi_osc_rdma_no_locks) {
|
||||
/* NTH: ensure we don't leave wait/process_flush/etc until this
|
||||
* accumulate operation is complete. */
|
||||
module->passive_incoming_frag_signal_count[source]++;
|
||||
}
|
||||
|
||||
pending_acc->source = source;
|
||||
|
||||
/* save any inline data (eager acc, gacc only) */
|
||||
@ -1122,6 +1131,11 @@ int ompi_osc_rdma_progress_pending_acc (ompi_osc_rdma_module_t *module)
|
||||
assert (0);
|
||||
}
|
||||
|
||||
if (!ompi_osc_rdma_no_locks) {
|
||||
/* signal that an operation is complete */
|
||||
mark_incoming_completion (module, pending_acc->source);
|
||||
}
|
||||
|
||||
pending_acc->data = NULL;
|
||||
OBJ_RELEASE(pending_acc);
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user