From 0fae2caef38080166696c5dee9fcf2d00b86d9c3 Mon Sep 17 00:00:00 2001 From: Mike Dubman Date: Thu, 9 Jan 2014 11:27:24 +0000 Subject: [PATCH] Create a comm keyval for hcoll component with delete callback function. Set comm attribute with keyval. Wait for pending hcoll module tasks in comm delete callback where PML still valid on the communicator. safely destroy hcoll context during hcoll module destructor. Author: Devendar Bureddy reviewed by miked cmr=v1.7.4:reviewer=ompi-rm1.7 This commit was SVN r30175. --- ompi/mca/coll/hcoll/coll_hcoll.h | 1 + ompi/mca/coll/hcoll/coll_hcoll_module.c | 51 +++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/ompi/mca/coll/hcoll/coll_hcoll.h b/ompi/mca/coll/hcoll/coll_hcoll.h index 23a2a67645..e379c59777 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll.h +++ b/ompi/mca/coll/hcoll/coll_hcoll.h @@ -19,6 +19,7 @@ #include "ompi/mca/pml/pml.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/communicator/communicator.h" +#include "ompi/attribute/attribute.h" #include "ompi/op/op.h" #include "orte/runtime/orte_globals.h" diff --git a/ompi/mca/coll/hcoll/coll_hcoll_module.c b/ompi/mca/coll/hcoll/coll_hcoll_module.c index daa87d3bd6..1795009160 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_module.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_module.c @@ -10,6 +10,8 @@ #include "ompi_config.h" #include "coll_hcoll.h" +int hcoll_comm_attr_keyval; + /* * Initial query function that is invoked during MPI_INIT, allowing * this module to indicate what level of thread support it provides. @@ -57,6 +59,7 @@ static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module am = &mca_coll_hcoll_component.active_modules; if (hcoll_module->comm == &ompi_mpi_comm_world.comm){ +#if 0 /* If we get here then we are detroying MPI_COMM_WORLD now. So, * it is safe to destory all the other communicators and corresponding * hcoll contexts that could still be on the "active_modules" list. @@ -86,6 +89,10 @@ static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module (rte_grp_handle_t)hcoll_module->comm, &context_destroyed); } +#endif + if (OMPI_SUCCESS != ompi_attr_free_keyval(COMM_ATTR, &hcoll_comm_attr_keyval, 0)) { + HCOL_VERBOSE(1,"hcoll ompi_attr_free_keyval failed"); + } } OBJ_RELEASE(hcoll_module->previous_barrier_module); @@ -104,6 +111,11 @@ static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module OBJ_RELEASE(hcoll_module->previous_ibcast_module); OBJ_RELEASE(hcoll_module->previous_iallreduce_module); OBJ_RELEASE(hcoll_module->previous_iallgather_module); + context_destroyed = 0; + hcoll_destroy_context(hcoll_module->hcoll_context, + (rte_grp_handle_t)hcoll_module->comm, + &context_destroyed); + assert(context_destroyed); mca_coll_hcoll_module_clear(hcoll_module); } @@ -142,7 +154,18 @@ static int __save_coll_handlers(mca_coll_hcoll_module_t *hcoll_module) return OMPI_SUCCESS; } +/* +** Communicator free callback +*/ +int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra) +{ + mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) attr_val; + + hcoll_group_destroy_notify(hcoll_module->hcoll_context); + return OMPI_SUCCESS; + +} /* * Initialize module on the communicator */ @@ -150,6 +173,7 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module, struct ompi_communicator_t *comm) { mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) module; + int ret; hcoll_module->comm = comm; if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){ HCOL_ERROR("coll_hcol: __save_coll_handlers failed"); @@ -166,6 +190,7 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module, return OMPI_ERROR; } +#if 0 if (comm != &ompi_mpi_comm_world.comm){ mca_coll_hcoll_module_list_item_wrapper_t *mw = OBJ_NEW(mca_coll_hcoll_module_list_item_wrapper_t); @@ -174,6 +199,13 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module, opal_list_append(&mca_coll_hcoll_component.active_modules, (opal_list_item_t*)mw); } +#endif + + ret = ompi_attr_set_c(COMM_ATTR, comm, &comm->c_keyhash, hcoll_comm_attr_keyval, (void *)hcoll_module, false); + if (OMPI_SUCCESS != ret) { + HCOL_VERBOSE(1,"hcoll ompi_attr_set_c failed"); + return OMPI_ERROR; + } return OMPI_SUCCESS; } @@ -197,7 +229,7 @@ int mca_coll_hcoll_progress(void) if (ompi_mpi_finalized){ hcoll_rte_p2p_disabled_notify(); } - +#if 0 item = opal_list_get_first(am); while (item != opal_list_get_end(am)){ item_next = opal_list_get_next(item); @@ -225,10 +257,9 @@ int mca_coll_hcoll_progress(void) } item = item_next; } +#endif - if (!ompi_mpi_finalized) { - (*hcoll_progress_fn)(); - } + (*hcoll_progress_fn)(); OPAL_THREAD_ADD32(&mca_coll_hcoll_component.progress_lock,-1); return OMPI_SUCCESS; } @@ -245,6 +276,9 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority) mca_coll_base_module_t *module; mca_coll_hcoll_module_t *hcoll_module; static bool libhcoll_initialized = false; + ompi_attribute_fn_ptr_union_t del_fn; + ompi_attribute_fn_ptr_union_t copy_fn; + int err; int rc; *priority = 0; @@ -269,6 +303,15 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority) HCOL_VERBOSE(0,"Hcol library init failed"); return NULL; } + + copy_fn.attr_communicator_copy_fn = (MPI_Comm_internal_copy_attr_function*) MPI_COMM_NULL_COPY_FN; + del_fn.attr_communicator_delete_fn = hcoll_comm_attr_del_fn; + err = ompi_attr_create_keyval(COMM_ATTR, copy_fn, del_fn, &hcoll_comm_attr_keyval, NULL ,0, NULL); + if (OMPI_SUCCESS != err) { + HCOL_VERBOSE(0,"Hcol comm keyval create failed"); + return NULL; + } + libhcoll_initialized = true; } hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t);