HCOLL: many fixes
- Adds the coll_hcoll_np MCA parameter, similar to that of the fca component (defaults to 32). Those who use hcoll should be aware that from now on communicators with fewer than 32 procs will run w/o hcoll by default. - Resolves the fallback issue in case libhcoll runs out of allowed contexts. The solution is moving hcoll_create_context from comm_enable to comm_query. In short, comm_enable should never return OMPI_ERROR in the coll component with the highest priority (hcoll). Otherwise the ompi coll_base_select will unselect the coll function pointers and module references, leaving the communicator w/o coll pointers. This will cause a failure. The same behavior can be reproduced even with tuned if one hardcodes a "return OMPI_ERROR" into its module_enable function. - Additionally, removed all the dead code under #if 0; removed unused variables (path for library, active_modules list) and classes (module list wrapper). Fixed by Val, Reviewed by Devendar/Josh/Miked cmr=v1.7.4:reviewer=ompi-rm1.7 This commit was SVN r30341.
Этот коммит содержится в:
родитель
37343574e0
Коммит
b8550a55a7
@ -54,21 +54,22 @@ struct mca_coll_hcoll_component_t {
|
|||||||
/** MCA parameter: Verbose level of this component */
|
/** MCA parameter: Verbose level of this component */
|
||||||
int hcoll_verbose;
|
int hcoll_verbose;
|
||||||
|
|
||||||
/** MCA parameter: Path to libfca.so */
|
|
||||||
char* hcoll_lib_path;
|
|
||||||
|
|
||||||
/** MCA parameter: Enable FCA */
|
/** MCA parameter: Enable FCA */
|
||||||
int hcoll_enable;
|
int hcoll_enable;
|
||||||
|
|
||||||
|
/** MCA parameter: Minimal number of processes in the communicator
|
||||||
|
for the corresponding hcoll context to be created */
|
||||||
|
int hcoll_np;
|
||||||
|
|
||||||
|
/** Whether or not hcoll_init was ever called */
|
||||||
|
bool libhcoll_initialized;
|
||||||
|
|
||||||
/** MCA parameter: ON/OFF user defined datatype through HCOLL */
|
/** MCA parameter: ON/OFF user defined datatype through HCOLL */
|
||||||
int hcoll_datatype_fallback;
|
int hcoll_datatype_fallback;
|
||||||
|
|
||||||
/* FCA global stuff */
|
/* FCA global stuff */
|
||||||
void *hcoll_lib_handle; /* FCA dynamic library */
|
|
||||||
mca_coll_hcoll_ops_t hcoll_ops;
|
mca_coll_hcoll_ops_t hcoll_ops;
|
||||||
ompi_free_list_t requests;
|
ompi_free_list_t requests;
|
||||||
opal_list_t active_modules;
|
|
||||||
volatile uint32_t progress_lock;
|
|
||||||
};
|
};
|
||||||
typedef struct mca_coll_hcoll_component_t mca_coll_hcoll_component_t;
|
typedef struct mca_coll_hcoll_component_t mca_coll_hcoll_component_t;
|
||||||
|
|
||||||
@ -126,13 +127,6 @@ OBJ_CLASS_DECLARATION(mca_coll_hcoll_module_t);
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct mca_coll_hcoll_module_list_item_wrapper_t{
|
|
||||||
opal_list_item_t super;
|
|
||||||
mca_coll_hcoll_module_t *module;
|
|
||||||
} mca_coll_hcoll_module_list_item_wrapper_t;
|
|
||||||
|
|
||||||
OBJ_CLASS_DECLARATION(mca_coll_hcoll_module_list_item_wrapper_t);
|
|
||||||
|
|
||||||
|
|
||||||
/* API functions */
|
/* API functions */
|
||||||
int mca_coll_hcoll_init_query(bool enable_progress_threads, bool enable_mpi_threads);
|
int mca_coll_hcoll_init_query(bool enable_progress_threads, bool enable_mpi_threads);
|
||||||
|
@ -56,10 +56,9 @@ mca_coll_hcoll_component_t mca_coll_hcoll_component = {
|
|||||||
mca_coll_hcoll_init_query,
|
mca_coll_hcoll_init_query,
|
||||||
mca_coll_hcoll_comm_query,
|
mca_coll_hcoll_comm_query,
|
||||||
},
|
},
|
||||||
90,
|
90, /* priority */
|
||||||
0,
|
0, /* verbose level */
|
||||||
"",
|
1 /* hcoll_enable */
|
||||||
1
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -74,11 +73,6 @@ int mca_coll_hcoll_get_lib(void)
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mca_coll_hcoll_close_lib(void)
|
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* * Local flags
|
* * Local flags
|
||||||
* */
|
* */
|
||||||
@ -96,7 +90,6 @@ enum {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* utility routine for string parameter registration
|
* utility routine for string parameter registration
|
||||||
*/
|
*/
|
||||||
@ -131,7 +124,6 @@ static int reg_string(const char* param_name,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Utility routine for integer parameter registration
|
* Utility routine for integer parameter registration
|
||||||
*/
|
*/
|
||||||
@ -177,42 +169,43 @@ static int hcoll_register(void)
|
|||||||
|
|
||||||
ret = OMPI_SUCCESS;
|
ret = OMPI_SUCCESS;
|
||||||
|
|
||||||
#define CHECK(expr) do { \
|
#define CHECK(expr) do { \
|
||||||
tmp = (expr); \
|
tmp = (expr); \
|
||||||
if (OMPI_SUCCESS != tmp) ret = tmp; \
|
if (OMPI_SUCCESS != tmp) ret = tmp; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CHECK(reg_int("priority",NULL,
|
CHECK(reg_int("priority",NULL,
|
||||||
"Priority of the hcol coll component",
|
"Priority of the hcol coll component",
|
||||||
90,
|
90,
|
||||||
&mca_coll_hcoll_component.hcoll_priority,
|
&mca_coll_hcoll_component.hcoll_priority,
|
||||||
0));
|
0));
|
||||||
|
|
||||||
CHECK(reg_int("verbose", NULL,
|
CHECK(reg_int("verbose", NULL,
|
||||||
"Verbose level of the hcol coll component",
|
"Verbose level of the hcol coll component",
|
||||||
0,
|
0,
|
||||||
&mca_coll_hcoll_component.hcoll_verbose,
|
&mca_coll_hcoll_component.hcoll_verbose,
|
||||||
0));
|
0));
|
||||||
|
|
||||||
CHECK(reg_int("enable",NULL,
|
CHECK(reg_int("enable",NULL,
|
||||||
"[1|0|] Enable/Disable HCOL",
|
"[1|0|] Enable/Disable HCOL",
|
||||||
1 /*enable by default*/,
|
1 /*enable by default*/,
|
||||||
&mca_coll_hcoll_component.hcoll_enable,
|
&mca_coll_hcoll_component.hcoll_enable,
|
||||||
0));
|
0));
|
||||||
|
|
||||||
|
CHECK(reg_int("np",NULL,
|
||||||
|
"Minimal number of processes in the communicator"
|
||||||
|
" for the corresponding hcoll context to be created (default: 32)",
|
||||||
|
2 /*enable by default*/,
|
||||||
|
&mca_coll_hcoll_component.hcoll_np,
|
||||||
|
0));
|
||||||
|
|
||||||
CHECK(reg_int("datatype_fallback",NULL,
|
CHECK(reg_int("datatype_fallback",NULL,
|
||||||
"[1|0|] Enable/Disable user defined dattypes fallback",
|
"[1|0|] Enable/Disable user defined dattypes fallback",
|
||||||
1 /*enable by default*/,
|
1 /*enable by default*/,
|
||||||
&mca_coll_hcoll_component.hcoll_datatype_fallback,
|
&mca_coll_hcoll_component.hcoll_datatype_fallback,
|
||||||
0));
|
0));
|
||||||
|
|
||||||
CHECK(reg_string("library_path", NULL,
|
|
||||||
"HCOL /path/to/libhcol.so",
|
|
||||||
""COLL_HCOLL_HOME"/libhcol.so",
|
|
||||||
&mca_coll_hcoll_component.hcoll_lib_path,
|
|
||||||
0));
|
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -220,23 +213,25 @@ static int hcoll_register(void)
|
|||||||
static int hcoll_open(void)
|
static int hcoll_open(void)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
mca_coll_hcoll_component_t *cm;
|
||||||
|
cm = &mca_coll_hcoll_component;
|
||||||
|
|
||||||
mca_coll_hcoll_output = opal_output_open(NULL);
|
mca_coll_hcoll_output = opal_output_open(NULL);
|
||||||
opal_output_set_verbosity(mca_coll_hcoll_output, mca_coll_hcoll_component.hcoll_verbose);
|
opal_output_set_verbosity(mca_coll_hcoll_output, cm->hcoll_verbose);
|
||||||
|
|
||||||
hcoll_rte_fns_setup();
|
hcoll_rte_fns_setup();
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&mca_coll_hcoll_component.active_modules, opal_list_t);
|
cm->libhcoll_initialized = false;
|
||||||
|
|
||||||
mca_coll_hcoll_component.progress_lock = -1;
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int hcoll_close(void)
|
static int hcoll_close(void)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
mca_coll_hcoll_component_t *cm;
|
||||||
|
cm = &mca_coll_hcoll_component;
|
||||||
|
|
||||||
if (false == mca_coll_hcoll_component.hcoll_enable) {
|
if (false == cm->libhcoll_initialized) {
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -244,8 +239,6 @@ static int hcoll_close(void)
|
|||||||
rc = hcoll_finalize();
|
rc = hcoll_finalize();
|
||||||
|
|
||||||
opal_progress_unregister(mca_coll_hcoll_progress);
|
opal_progress_unregister(mca_coll_hcoll_progress);
|
||||||
OBJ_DESTRUCT(&mca_coll_hcoll_component.active_modules);
|
|
||||||
memset(&mca_coll_hcoll_component.active_modules,0,sizeof(mca_coll_hcoll_component.active_modules));
|
|
||||||
if (HCOLL_SUCCESS != rc){
|
if (HCOLL_SUCCESS != rc){
|
||||||
HCOL_VERBOSE(1,"Hcol library finalize failed");
|
HCOL_VERBOSE(1,"Hcol library finalize failed");
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
|
@ -18,7 +18,6 @@ int hcoll_comm_attr_keyval;
|
|||||||
*/
|
*/
|
||||||
int mca_coll_hcoll_init_query(bool enable_progress_threads, bool enable_mpi_threads)
|
int mca_coll_hcoll_init_query(bool enable_progress_threads, bool enable_mpi_threads)
|
||||||
{
|
{
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -50,73 +49,45 @@ static void mca_coll_hcoll_module_construct(mca_coll_hcoll_module_t *hcoll_modul
|
|||||||
|
|
||||||
static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module)
|
static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module)
|
||||||
{
|
{
|
||||||
opal_list_item_t *item, *item_next;
|
|
||||||
opal_list_t *am;
|
|
||||||
mca_coll_hcoll_module_t *module;
|
mca_coll_hcoll_module_t *module;
|
||||||
ompi_communicator_t *comm;
|
ompi_communicator_t *comm;
|
||||||
int context_destroyed;
|
int context_destroyed;
|
||||||
|
|
||||||
am = &mca_coll_hcoll_component.active_modules;
|
|
||||||
|
|
||||||
if (hcoll_module->comm == &ompi_mpi_comm_world.comm){
|
if (hcoll_module->comm == &ompi_mpi_comm_world.comm){
|
||||||
#if 0
|
|
||||||
/* If we get here then we are detroying MPI_COMM_WORLD now. So,
|
|
||||||
* it is safe to destory all the other communicators and corresponding
|
|
||||||
* hcoll contexts that could still be on the "active_modules" list.
|
|
||||||
*/
|
|
||||||
item = opal_list_get_first(am);
|
|
||||||
while (item != opal_list_get_end(am)){
|
|
||||||
item_next = opal_list_get_next(item);
|
|
||||||
module = ((mca_coll_hcoll_module_list_item_wrapper_t *)item)->module;
|
|
||||||
comm = module->comm;
|
|
||||||
context_destroyed = 0;
|
|
||||||
while(!context_destroyed){
|
|
||||||
hcoll_destroy_context(module->hcoll_context,
|
|
||||||
(rte_grp_handle_t)comm,
|
|
||||||
&context_destroyed);
|
|
||||||
}
|
|
||||||
module->hcoll_context = NULL;
|
|
||||||
OBJ_RELEASE(comm);
|
|
||||||
opal_list_remove_item(am,item);
|
|
||||||
OBJ_RELEASE(item);
|
|
||||||
item = item_next;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Now destory the comm_world hcoll context as well */
|
|
||||||
context_destroyed = 0;
|
|
||||||
while(!context_destroyed){
|
|
||||||
hcoll_destroy_context(hcoll_module->hcoll_context,
|
|
||||||
(rte_grp_handle_t)hcoll_module->comm,
|
|
||||||
&context_destroyed);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (OMPI_SUCCESS != ompi_attr_free_keyval(COMM_ATTR, &hcoll_comm_attr_keyval, 0)) {
|
if (OMPI_SUCCESS != ompi_attr_free_keyval(COMM_ATTR, &hcoll_comm_attr_keyval, 0)) {
|
||||||
HCOL_VERBOSE(1,"hcoll ompi_attr_free_keyval failed");
|
HCOL_VERBOSE(1,"hcoll ompi_attr_free_keyval failed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_barrier_module);
|
/* If the hcoll_context is null then we are destroying the hcoll_module
|
||||||
OBJ_RELEASE(hcoll_module->previous_bcast_module);
|
that didn't initialized fallback colls/modules.
|
||||||
OBJ_RELEASE(hcoll_module->previous_reduce_module);
|
Then just clear and return. Otherwise release module pointers and
|
||||||
OBJ_RELEASE(hcoll_module->previous_allreduce_module);
|
destroy hcoll context*/
|
||||||
OBJ_RELEASE(hcoll_module->previous_allgather_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_allgatherv_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_gather_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_gatherv_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_alltoall_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_alltoallv_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_alltoallw_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_reduce_scatter_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_ibarrier_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_ibcast_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_iallreduce_module);
|
|
||||||
OBJ_RELEASE(hcoll_module->previous_iallgather_module);
|
|
||||||
context_destroyed = 0;
|
|
||||||
hcoll_destroy_context(hcoll_module->hcoll_context,
|
|
||||||
(rte_grp_handle_t)hcoll_module->comm,
|
|
||||||
&context_destroyed);
|
|
||||||
assert(context_destroyed);
|
|
||||||
|
|
||||||
|
if (hcoll_module->hcoll_context != NULL){
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_barrier_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_bcast_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_reduce_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_allreduce_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_allgather_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_allgatherv_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_gather_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_gatherv_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_alltoall_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_alltoallv_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_alltoallw_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_reduce_scatter_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_ibarrier_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_ibcast_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_iallreduce_module);
|
||||||
|
OBJ_RELEASE(hcoll_module->previous_iallgather_module);
|
||||||
|
|
||||||
|
context_destroyed = 0;
|
||||||
|
hcoll_destroy_context(hcoll_module->hcoll_context,
|
||||||
|
(rte_grp_handle_t)hcoll_module->comm,
|
||||||
|
&context_destroyed);
|
||||||
|
assert(context_destroyed);
|
||||||
|
}
|
||||||
mca_coll_hcoll_module_clear(hcoll_module);
|
mca_coll_hcoll_module_clear(hcoll_module);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,9 +101,10 @@ static void mca_coll_hcoll_module_destruct(mca_coll_hcoll_module_t *hcoll_module
|
|||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
|
|
||||||
static int __save_coll_handlers(mca_coll_hcoll_module_t *hcoll_module)
|
static int mca_coll_hcoll_save_coll_handlers(mca_coll_hcoll_module_t *hcoll_module)
|
||||||
{
|
{
|
||||||
ompi_communicator_t *comm = hcoll_module->comm;
|
ompi_communicator_t *comm;
|
||||||
|
comm = hcoll_module->comm;
|
||||||
|
|
||||||
HCOL_SAVE_PREV_COLL_API(barrier);
|
HCOL_SAVE_PREV_COLL_API(barrier);
|
||||||
HCOL_SAVE_PREV_COLL_API(bcast);
|
HCOL_SAVE_PREV_COLL_API(bcast);
|
||||||
@ -154,13 +126,16 @@ static int __save_coll_handlers(mca_coll_hcoll_module_t *hcoll_module)
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
** Communicator free callback
|
** Communicator free callback
|
||||||
*/
|
*/
|
||||||
int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra)
|
int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, void *extra)
|
||||||
{
|
{
|
||||||
|
|
||||||
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) attr_val;
|
mca_coll_hcoll_module_t *hcoll_module;
|
||||||
|
hcoll_module = (mca_coll_hcoll_module_t*) attr_val;
|
||||||
|
|
||||||
hcoll_group_destroy_notify(hcoll_module->hcoll_context);
|
hcoll_group_destroy_notify(hcoll_module->hcoll_context);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
@ -172,36 +147,14 @@ int hcoll_comm_attr_del_fn(MPI_Comm comm, int keyval, void *attr_val, void *extr
|
|||||||
static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
|
static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
|
||||||
struct ompi_communicator_t *comm)
|
struct ompi_communicator_t *comm)
|
||||||
{
|
{
|
||||||
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*) module;
|
|
||||||
int ret;
|
int ret;
|
||||||
hcoll_module->comm = comm;
|
|
||||||
if (OMPI_SUCCESS != __save_coll_handlers(hcoll_module)){
|
if (OMPI_SUCCESS != mca_coll_hcoll_save_coll_handlers((mca_coll_hcoll_module_t *)module)){
|
||||||
HCOL_ERROR("coll_hcol: __save_coll_handlers failed");
|
HCOL_ERROR("coll_hcol: mca_coll_hcoll_save_coll_handlers failed");
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
hcoll_set_runtime_tag_offset(MCA_COLL_BASE_TAG_HCOLL_BASE, mca_pml.pml_max_tag);
|
ret = ompi_attr_set_c(COMM_ATTR, comm, &comm->c_keyhash, hcoll_comm_attr_keyval, (void *)module, false);
|
||||||
|
|
||||||
|
|
||||||
hcoll_module->hcoll_context =
|
|
||||||
hcoll_create_context((rte_grp_handle_t)comm);
|
|
||||||
if (NULL == hcoll_module->hcoll_context){
|
|
||||||
HCOL_VERBOSE(1,"hcoll_create_context returned NULL");
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
if (comm != &ompi_mpi_comm_world.comm){
|
|
||||||
mca_coll_hcoll_module_list_item_wrapper_t *mw =
|
|
||||||
OBJ_NEW(mca_coll_hcoll_module_list_item_wrapper_t);
|
|
||||||
mw->module = hcoll_module;
|
|
||||||
OBJ_RETAIN(hcoll_module->comm);
|
|
||||||
opal_list_append(&mca_coll_hcoll_component.active_modules,
|
|
||||||
(opal_list_item_t*)mw);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ret = ompi_attr_set_c(COMM_ATTR, comm, &comm->c_keyhash, hcoll_comm_attr_keyval, (void *)hcoll_module, false);
|
|
||||||
if (OMPI_SUCCESS != ret) {
|
if (OMPI_SUCCESS != ret) {
|
||||||
HCOL_VERBOSE(1,"hcoll ompi_attr_set_c failed");
|
HCOL_VERBOSE(1,"hcoll ompi_attr_set_c failed");
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
@ -212,55 +165,11 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
|
|||||||
|
|
||||||
int mca_coll_hcoll_progress(void)
|
int mca_coll_hcoll_progress(void)
|
||||||
{
|
{
|
||||||
opal_list_item_t *item, *item_next;
|
|
||||||
opal_list_t *am;
|
|
||||||
mca_coll_hcoll_module_t *module;
|
|
||||||
ompi_communicator_t *comm;
|
|
||||||
int context_destroyed;
|
|
||||||
OPAL_THREAD_ADD32(&mca_coll_hcoll_component.progress_lock,1);
|
|
||||||
|
|
||||||
am = &mca_coll_hcoll_component.active_modules;
|
|
||||||
|
|
||||||
if (mca_coll_hcoll_component.progress_lock){
|
|
||||||
OPAL_THREAD_ADD32(&mca_coll_hcoll_component.progress_lock,-1);
|
|
||||||
(*hcoll_progress_fn)();
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
}
|
|
||||||
if (ompi_mpi_finalized){
|
if (ompi_mpi_finalized){
|
||||||
hcoll_rte_p2p_disabled_notify();
|
hcoll_rte_p2p_disabled_notify();
|
||||||
}
|
}
|
||||||
#if 0
|
|
||||||
item = opal_list_get_first(am);
|
|
||||||
while (item != opal_list_get_end(am)){
|
|
||||||
item_next = opal_list_get_next(item);
|
|
||||||
module = ((mca_coll_hcoll_module_list_item_wrapper_t *)item)->module;
|
|
||||||
comm = module->comm;
|
|
||||||
if (((opal_object_t*)comm)->obj_reference_count == 1){
|
|
||||||
/* Ok, if we are here then nobody owns a communicator pointed with comm except
|
|
||||||
* for coll_hcoll. Hence, it is safe to remove the hcoll context firstly and
|
|
||||||
* call release on the communicator.
|
|
||||||
*
|
|
||||||
* The call to hcoll_destroy_context is not blocking. The last parameter on the return
|
|
||||||
* indicates whether the context has been destroyd (1) or not (0). In the latter
|
|
||||||
* case one should call destroy again after some progressing
|
|
||||||
*/
|
|
||||||
context_destroyed = 0;
|
|
||||||
hcoll_destroy_context(module->hcoll_context,
|
|
||||||
(rte_grp_handle_t)comm,
|
|
||||||
&context_destroyed);
|
|
||||||
if (context_destroyed){
|
|
||||||
module->hcoll_context = NULL;
|
|
||||||
OBJ_RELEASE(comm);
|
|
||||||
opal_list_remove_item(am,item);
|
|
||||||
OBJ_RELEASE(item);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
item = item_next;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
(*hcoll_progress_fn)();
|
(*hcoll_progress_fn)();
|
||||||
OPAL_THREAD_ADD32(&mca_coll_hcoll_component.progress_lock,-1);
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -275,56 +184,75 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority)
|
|||||||
{
|
{
|
||||||
mca_coll_base_module_t *module;
|
mca_coll_base_module_t *module;
|
||||||
mca_coll_hcoll_module_t *hcoll_module;
|
mca_coll_hcoll_module_t *hcoll_module;
|
||||||
static bool libhcoll_initialized = false;
|
|
||||||
ompi_attribute_fn_ptr_union_t del_fn;
|
ompi_attribute_fn_ptr_union_t del_fn;
|
||||||
ompi_attribute_fn_ptr_union_t copy_fn;
|
ompi_attribute_fn_ptr_union_t copy_fn;
|
||||||
|
mca_coll_hcoll_component_t *cm;
|
||||||
int err;
|
int err;
|
||||||
int rc;
|
int rc;
|
||||||
|
cm = &mca_coll_hcoll_component;
|
||||||
*priority = 0;
|
*priority = 0;
|
||||||
module = NULL;
|
module = NULL;
|
||||||
|
|
||||||
if (!mca_coll_hcoll_component.hcoll_enable){
|
if (!cm->hcoll_enable){
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!libhcoll_initialized)
|
if (OMPI_COMM_IS_INTER(comm) || ompi_comm_size(comm) < cm->hcoll_np
|
||||||
|
|| ompi_comm_size(comm) < 2){
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (!cm->libhcoll_initialized)
|
||||||
{
|
{
|
||||||
/* libhcoll should be initialized here since current implmentation of
|
/* libhcoll should be initialized here since current implmentation of
|
||||||
mxm bcol in libhcoll needs world_group fully functional during init
|
mxm bcol in libhcoll needs world_group fully functional during init
|
||||||
world_group, i.e. ompi_comm_world, is not ready at hcoll component open
|
world_group, i.e. ompi_comm_world, is not ready at hcoll component open
|
||||||
call */
|
call */
|
||||||
opal_progress_register(mca_coll_hcoll_progress);
|
opal_progress_register(mca_coll_hcoll_progress);
|
||||||
|
|
||||||
|
hcoll_set_runtime_tag_offset(MCA_COLL_BASE_TAG_HCOLL_BASE, mca_pml.pml_max_tag);
|
||||||
|
|
||||||
|
HCOL_VERBOSE(10,"Calling hcoll_init();");
|
||||||
rc = hcoll_init();
|
rc = hcoll_init();
|
||||||
|
|
||||||
if (HCOLL_SUCCESS != rc){
|
if (HCOLL_SUCCESS != rc){
|
||||||
mca_coll_hcoll_component.hcoll_enable = 0;
|
cm->hcoll_enable = 0;
|
||||||
opal_progress_unregister(hcoll_progress_fn);
|
opal_progress_unregister(hcoll_progress_fn);
|
||||||
HCOL_VERBOSE(0,"Hcol library init failed");
|
HCOL_ERROR("Hcol library init failed");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
copy_fn.attr_communicator_copy_fn = (MPI_Comm_internal_copy_attr_function*) MPI_COMM_NULL_COPY_FN;
|
copy_fn.attr_communicator_copy_fn = (MPI_Comm_internal_copy_attr_function*) MPI_COMM_NULL_COPY_FN;
|
||||||
del_fn.attr_communicator_delete_fn = hcoll_comm_attr_del_fn;
|
del_fn.attr_communicator_delete_fn = hcoll_comm_attr_del_fn;
|
||||||
err = ompi_attr_create_keyval(COMM_ATTR, copy_fn, del_fn, &hcoll_comm_attr_keyval, NULL ,0, NULL);
|
err = ompi_attr_create_keyval(COMM_ATTR, copy_fn, del_fn, &hcoll_comm_attr_keyval, NULL ,0, NULL);
|
||||||
if (OMPI_SUCCESS != err) {
|
if (OMPI_SUCCESS != err) {
|
||||||
HCOL_VERBOSE(0,"Hcol comm keyval create failed");
|
HCOL_ERROR("Hcol comm keyval create failed");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
libhcoll_initialized = true;
|
cm->libhcoll_initialized = true;
|
||||||
}
|
}
|
||||||
hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t);
|
hcoll_module = OBJ_NEW(mca_coll_hcoll_module_t);
|
||||||
if (!hcoll_module){
|
if (!hcoll_module){
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ompi_comm_size(comm) < 2 || OMPI_COMM_IS_INTER(comm)){
|
hcoll_module->comm = comm;
|
||||||
|
|
||||||
|
HCOL_VERBOSE(10,"Creating hcoll_context for comm %p, comm_id %d, comm_size %d",
|
||||||
|
(void*)comm,comm->c_contextid,ompi_comm_size(comm));
|
||||||
|
|
||||||
|
hcoll_module->hcoll_context =
|
||||||
|
hcoll_create_context((rte_grp_handle_t)comm);
|
||||||
|
|
||||||
|
if (NULL == hcoll_module->hcoll_context){
|
||||||
|
HCOL_VERBOSE(1,"hcoll_create_context returned NULL");
|
||||||
|
OBJ_RELEASE(hcoll_module);
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
hcoll_module->super.coll_module_enable = mca_coll_hcoll_module_enable;
|
hcoll_module->super.coll_module_enable = mca_coll_hcoll_module_enable;
|
||||||
hcoll_module->super.coll_barrier = hcoll_collectives.coll_barrier ? mca_coll_hcoll_barrier : NULL;
|
hcoll_module->super.coll_barrier = hcoll_collectives.coll_barrier ? mca_coll_hcoll_barrier : NULL;
|
||||||
hcoll_module->super.coll_bcast = hcoll_collectives.coll_bcast ? mca_coll_hcoll_bcast : NULL;
|
hcoll_module->super.coll_bcast = hcoll_collectives.coll_bcast ? mca_coll_hcoll_bcast : NULL;
|
||||||
@ -337,7 +265,7 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority)
|
|||||||
hcoll_module->super.coll_iallreduce = hcoll_collectives.coll_iallreduce ? mca_coll_hcoll_iallreduce : NULL;
|
hcoll_module->super.coll_iallreduce = hcoll_collectives.coll_iallreduce ? mca_coll_hcoll_iallreduce : NULL;
|
||||||
hcoll_module->super.coll_gather = hcoll_collectives.coll_gather ? mca_coll_hcoll_gather : NULL;
|
hcoll_module->super.coll_gather = hcoll_collectives.coll_gather ? mca_coll_hcoll_gather : NULL;
|
||||||
|
|
||||||
*priority = mca_coll_hcoll_component.hcoll_priority;
|
*priority = cm->hcoll_priority;
|
||||||
module = &hcoll_module->super;
|
module = &hcoll_module->super;
|
||||||
|
|
||||||
exit:
|
exit:
|
||||||
@ -352,7 +280,3 @@ OBJ_CLASS_INSTANCE(mca_coll_hcoll_module_t,
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(mca_coll_hcoll_module_list_item_wrapper_t,
|
|
||||||
opal_list_item_t,
|
|
||||||
NULL,NULL);
|
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user