Fix a race condition that can occur during finalize
1. Changed the RTE API implementation: comm_world is now used for p2p communication, so there is no need to worry about other communicators being destroyed. 2. Added a notification mechanism through which the runtime can tell libhcoll that the RTE API can no longer be used: a pointer to a flag, and the flag's size, are passed to libhcoll. The flag changes when the RTE is no longer available. Currently this flag is just the global bool ompi_mpi_finalized. cmr=v1.7.3:reviewer=jladd This commit was SVN r29331.
Этот коммит содержится в:
родитель
ed62a3c7c8
Коммит
19748e6957
@ -42,6 +42,7 @@ typedef struct mca_coll_hcoll_ops_t {
|
||||
int (*hcoll_barrier)(void *);
|
||||
} mca_coll_hcoll_ops_t;
|
||||
|
||||
|
||||
struct mca_coll_hcoll_component_t {
|
||||
/** Base coll component */
|
||||
mca_coll_base_component_2_0_0_t super;
|
||||
|
@ -229,6 +229,8 @@ static int hcoll_close(void)
|
||||
int rc;
|
||||
HCOL_VERBOSE(5,"HCOLL FINALIZE");
|
||||
rc = hcoll_finalize();
|
||||
|
||||
|
||||
opal_progress_unregister(hcoll_progress_fn);
|
||||
if (HCOLL_SUCCESS != rc){
|
||||
HCOL_VERBOSE(1,"Hcol library finalize failed");
|
||||
@ -236,3 +238,4 @@ static int hcoll_close(void)
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -108,19 +108,22 @@ static int __save_coll_handlers(mca_coll_hcoll_module_t *hcoll_module)
|
||||
* Initialize module on the communicator
|
||||
*/
|
||||
static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
|
||||
struct ompi_communicator_t *comm)
|
||||
struct ompi_communicator_t *comm)
|
||||
{
|
||||
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) module;
|
||||
hcoll_module->comm = comm;
|
||||
|
||||
if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){
|
||||
HCOL_ERROR("coll_hcol: __save_coll_handlers failed");
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
hcoll_set_runtime_tag_offset(-100,mca_pml.pml_max_tag);
|
||||
hcoll_set_rte_halt_flag_address(&ompi_mpi_finalized);
|
||||
hcoll_set_rte_halt_flag_size(sizeof(ompi_mpi_finalized));
|
||||
|
||||
hcoll_module->hcoll_context =
|
||||
hcoll_create_context((rte_grp_handle_t)comm);
|
||||
hcoll_create_context((rte_grp_handle_t)comm);
|
||||
if (NULL == hcoll_module->hcoll_context){
|
||||
HCOL_VERBOSE(1,"hcoll_create_context returned NULL");
|
||||
return OMPI_ERROR;
|
||||
|
@ -163,7 +163,7 @@ static int recv_nb(struct dte_data_representation_t data,
|
||||
uint32_t tag,
|
||||
rte_request_handle_t *req)
|
||||
{
|
||||
ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;
|
||||
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
|
||||
|
||||
if (NULL == ec_h.handle && -1 != ec_h.rank) {
|
||||
fprintf(stderr,"***Error in hcolrte_rml_recv_nb: wrong null argument: "
|
||||
@ -234,7 +234,7 @@ static int send_nb( dte_data_representation_t data,
|
||||
uint32_t tag,
|
||||
rte_request_handle_t *req)
|
||||
{
|
||||
ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;
|
||||
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
|
||||
|
||||
if (! ec_h.handle) {
|
||||
fprintf(stderr,"***Error in hcolrte_rml_send_nb: wrong null argument: "
|
||||
@ -329,7 +329,7 @@ static int get_ec_handles( int num_ec ,
|
||||
ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;
|
||||
for (i=0; i<num_ec; i++){
|
||||
ompi_proc_t *proc = ompi_comm_peer_lookup(comm,ec_indexes[i]);
|
||||
ec_handles[i].rank = ec_indexes[i];
|
||||
ec_handles[i].rank = proc->proc_name.vpid;
|
||||
ec_handles[i].handle = (void *)proc;
|
||||
}
|
||||
return HCOLL_SUCCESS;
|
||||
@ -375,7 +375,7 @@ static int group_id(rte_grp_handle_t group){
|
||||
return ((ompi_communicator_t *)group)->c_contextid;
|
||||
}
|
||||
|
||||
static int
|
||||
static int
|
||||
request_free(struct ompi_request_t **ompi_req)
|
||||
{
|
||||
ompi_request_t *req = *ompi_req;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user