fixes: 1127
Fix some of the multi-threading problems in the cid allocation. Two bugs specifically: - Since we do not have a queue for incoming fragments of an unknown cid, we need to synchronize all processes before exiting the communicator creation. This synchronization was/is located in comm_activate, which was, however, too late for the multi-threaded case. Thus, for multi-threaded scenarios we now synchronize 'before' we allow another thread to enter the cid-allocation loop. - For synchronization we used, for the sake of simplicity, allreduce operations. It turns out that these operations interfered with the allreduce operations in the cid-allocation routine, which led to nonsensical results in the cid allocation and potentially to endless loops. Multi-threaded communicator creation seems to work now, but is still 'very very' slow. I think the busy wait of threads is killing the performance of the active threads in the cid allocation. But this is another topic. This commit was SVN r15910.
Этот коммит содержится в:
родитель
e333de3fc7
Коммит
0684002812
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2006-2007 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2007 Cisco, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -107,6 +107,7 @@ OBJ_CLASS_INSTANCE (ompi_comm_reg_t,
|
||||
static opal_mutex_t ompi_cid_lock;
|
||||
static opal_list_t ompi_registered_comms;
|
||||
|
||||
|
||||
int ompi_comm_nextcid ( ompi_communicator_t* newcomm,
|
||||
ompi_communicator_t* comm,
|
||||
ompi_communicator_t* bridgecomm,
|
||||
@ -227,6 +228,13 @@ int ompi_comm_nextcid ( ompi_communicator_t* newcomm,
|
||||
newcomm->c_f_to_c_index = newcomm->c_contextid;
|
||||
ompi_pointer_array_set_item (&ompi_mpi_communicators, nextcid, newcomm);
|
||||
|
||||
/* for synchronization purposes, avoids receiving fragments for
|
||||
a communicator id, which might not yet be known. For single-threaded
|
||||
scenarios, this call is in ompi_comm_activate, for multi-threaded
|
||||
scenarios, it has to be already here ( before releasing another
|
||||
thread into the cid-allocation loop ) */
|
||||
(allredfnct)(&response, &glresponse, 1, MPI_MIN, comm, bridgecomm,
|
||||
local_leader, remote_leader, send_first );
|
||||
OPAL_THREAD_LOCK(&ompi_cid_lock);
|
||||
ompi_comm_unregister_cid (comm->c_contextid);
|
||||
OPAL_THREAD_UNLOCK(&ompi_cid_lock);
|
||||
@ -426,8 +434,14 @@ int ompi_comm_activate ( ompi_communicator_t* newcomm,
|
||||
break;
|
||||
}
|
||||
|
||||
if (MPI_THREAD_MULTIPLE != ompi_mpi_thread_provided) {
|
||||
/* Only execute the synchronization for single-threaded scenarios.
|
||||
For multi-threaded cases, the synchronization has already
|
||||
been executed in the cid-allocation loop */
|
||||
(allredfnct)(&ok, &gok, 1, MPI_MIN, comm, bridgecomm,
|
||||
local_leader, remote_leader, send_first );
|
||||
|
||||
}
|
||||
}
|
||||
/* Check to see if this process is in the new communicator.
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user