
fix a race condition that can happen during finalize

1. Change in the RTE api implementation: comm_world is now used for p2p.
This means we no longer have to worry about other comms being destroyed.

2. Added a notification mechanism through which the runtime can tell libhcoll that the RTE api can no longer be used:
we pass a pointer to a flag, and its size, to libhcoll.
The flag changes when the RTE is no longer available.
Currently this flag is just the global ompi_mpi_finalized bool value (see the sketch after this list).
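
Below is a minimal, standalone sketch of the halt-flag handshake from point 2. The set_rte_halt_flag_address/set_rte_halt_flag_size names mirror the hcoll_set_rte_halt_flag_* calls in the diff; check_rte_usable() and the byte-wise polling are illustrative assumptions about how libhcoll could consume the flag, not its real internals:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Library side: remember where the runtime's halt flag lives. */
static const void *rte_halt_flag      = NULL;
static size_t      rte_halt_flag_size = 0;

static void set_rte_halt_flag_address(const void *addr) { rte_halt_flag = addr; }
static void set_rte_halt_flag_size(size_t size)         { rte_halt_flag_size = size; }

/* Before any p2p call through the RTE api, treat an all-zero flag as
 * "runtime still alive"; any nonzero byte means the RTE has shut down.
 * (Illustrative assumption, not libhcoll's actual check.) */
static bool check_rte_usable(void)
{
    const unsigned char *p = (const unsigned char *)rte_halt_flag;
    for (size_t i = 0; i < rte_halt_flag_size; i++) {
        if (p[i]) {
            return false;
        }
    }
    return true;
}

/* Runtime side: in Open MPI this role is played by the global
 * ompi_mpi_finalized bool, registered from mca_coll_hcoll_module_enable(). */
static bool mpi_finalized = false;

int main(void)
{
    set_rte_halt_flag_address(&mpi_finalized);
    set_rte_halt_flag_size(sizeof(mpi_finalized));

    printf("RTE usable before finalize: %d\n", check_rte_usable()); /* 1 */
    mpi_finalized = true;  /* finalize flips the flag */
    printf("RTE usable after finalize:  %d\n", check_rte_usable()); /* 0 */
    return 0;
}

Polling a runtime-owned flag (rather than having the runtime call back into libhcoll) means the library never has to go through the RTE api once finalize has begun, which is exactly the race this commit closes.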

cmr=v1.7.3:reviewer=jladd

This commit was SVN r29331.
Mike Dubman 2013-10-02 13:38:47 +00:00
parent ed62a3c7c8
commit 19748e6957
4 changed files with 13 additions and 6 deletions

View file

@@ -42,6 +42,7 @@ typedef struct mca_coll_hcoll_ops_t {
int (*hcoll_barrier)(void *);
} mca_coll_hcoll_ops_t;
struct mca_coll_hcoll_component_t {
/** Base coll component */
mca_coll_base_component_2_0_0_t super;

View file

@@ -229,6 +229,8 @@ static int hcoll_close(void)
int rc;
HCOL_VERBOSE(5,"HCOLL FINALIZE");
rc = hcoll_finalize();
opal_progress_unregister(hcoll_progress_fn);
if (HCOLL_SUCCESS != rc){
HCOL_VERBOSE(1,"Hcol library finalize failed");
@@ -236,3 +238,4 @@ static int hcoll_close(void)
}
return OMPI_SUCCESS;
}

View file

@@ -108,19 +108,22 @@ static int __save_coll_handlers(mca_coll_hcoll_module_t *hcoll_module)
* Initialize module on the communicator
*/
static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
-struct ompi_communicator_t *comm)
+struct ompi_communicator_t *comm)
{
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) module;
hcoll_module->comm = comm;
if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){
HCOL_ERROR("coll_hcol: __save_coll_handlers failed");
return OMPI_ERROR;
}
hcoll_set_runtime_tag_offset(-100,mca_pml.pml_max_tag);
+hcoll_set_rte_halt_flag_address(&ompi_mpi_finalized);
+hcoll_set_rte_halt_flag_size(sizeof(ompi_mpi_finalized));
hcoll_module->hcoll_context =
-hcoll_create_context((rte_grp_handle_t)comm);
+hcoll_create_context((rte_grp_handle_t)comm);
if (NULL == hcoll_module->hcoll_context){
HCOL_VERBOSE(1,"hcoll_create_context returned NULL");
return OMPI_ERROR;

View file

@@ -163,7 +163,7 @@ static int recv_nb(struct dte_data_representation_t data,
uint32_t tag,
rte_request_handle_t *req)
{
-ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;
+ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
if (NULL == ec_h.handle && -1 != ec_h.rank) {
fprintf(stderr,"***Error in hcolrte_rml_recv_nb: wrong null argument: "
@@ -234,7 +234,7 @@ static int send_nb( dte_data_representation_t data,
uint32_t tag,
rte_request_handle_t *req)
{
-ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;
+ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
if (! ec_h.handle) {
fprintf(stderr,"***Error in hcolrte_rml_send_nb: wrong null argument: "
@@ -329,7 +329,7 @@ static int get_ec_handles( int num_ec ,
ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;
for (i=0; i<num_ec; i++){
ompi_proc_t *proc = ompi_comm_peer_lookup(comm,ec_indexes[i]);
-ec_handles[i].rank = ec_indexes[i];
+ec_handles[i].rank = proc->proc_name.vpid;
ec_handles[i].handle = (void *)proc;
}
return HCOLL_SUCCESS;
@@ -375,7 +375,7 @@ static int group_id(rte_grp_handle_t group){
return ((ompi_communicator_t *)group)->c_contextid;
}
-static int
+static int
request_free(struct ompi_request_t **ompi_req)
{
ompi_request_t *req = *ompi_req;