1
1
The monitoring PML hides its existence from the OMPI infrastructure by
removing itself from the list of loaded PML components, remaining hidden
until MPI_Finalize.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
Этот коммит содержится в:
George Bosilca 2018-09-22 19:23:46 -04:00
родитель 668aa15dda
Коммит dc972f0b92
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 09C926752C9F09B1
2 изменённых файлов: 70 добавлений и 55 удалений

Просмотреть файл

@ -95,50 +95,6 @@ static int mca_pml_monitoring_component_open(void)
return OMPI_SUCCESS;
}
/**
 * Component close hook of the monitoring PML.
 *
 * Does nothing and returns OMPI_SUCCESS when mca_common_monitoring_enabled
 * is not set. Otherwise the behavior depends on mca_pml_monitoring_active:
 *  - active: forward the close to the saved PML component, release the
 *    repository reference on this component and clear the active flag;
 *  - not active: retain this component in the repository, save the selected
 *    PML component/module and install the monitoring ones in their place.
 *
 * @return OMPI_ERROR if retaining the component fails, OMPI_SUCCESS otherwise.
 */
static int mca_pml_monitoring_component_close(void)
{
if( !mca_common_monitoring_enabled ) return OMPI_SUCCESS;
/**
 * If this component is already active, then we are currently monitoring
 * the execution and this call to close is the one from MPI_Finalize.
 * Clean up and release the extra reference on ourselves.
 */
if( mca_pml_monitoring_active ) { /* Already active, turn off */
/* Forward the close to the PML saved at interception time; its return value is not checked */
pml_selected_component.pmlm_version.mca_close_component();
mca_base_component_repository_release((mca_base_component_t*)&mca_pml_monitoring_component);
mca_pml_monitoring_active = 0;
return OMPI_SUCCESS;
}
/**
 * We are supposed to monitor the execution. Save the winner PML component and
 * module, and swap it with ourselves. Increase our refcount so that we are
 * not dlclosed.
 */
if( OPAL_SUCCESS != mca_base_component_repository_retain_component(mca_pml_monitoring_component.pmlm_version.mca_type_name,
mca_pml_monitoring_component.pmlm_version.mca_component_name) ) {
return OMPI_ERROR;
}
/* Save a copy of the selected PML */
pml_selected_component = mca_pml_base_selected_component;
pml_selected_module = mca_pml;
/* Install our interception layer */
mca_pml_base_selected_component = mca_pml_monitoring_component;
mca_pml = mca_pml_monitoring_module;
/* Restore some of the original values: progress, flags, tags and context id */
mca_pml.pml_progress = pml_selected_module.pml_progress;
mca_pml.pml_max_contextid = pml_selected_module.pml_max_contextid;
mca_pml.pml_max_tag = pml_selected_module.pml_max_tag;
/* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
mca_pml.pml_flags = pml_selected_module.pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD;
/* From now on a further call takes the "already active" branch above */
mca_pml_monitoring_active = 1;
return OMPI_SUCCESS;
}
static mca_pml_base_module_t*
mca_pml_monitoring_component_init(int* priority,
bool enable_progress_threads,
@ -154,19 +110,72 @@ mca_pml_monitoring_component_init(int* priority,
/**
 * Finish hook of the monitoring PML component.
 *
 * When monitoring is enabled and the component is not yet active, it removes
 * itself from ompi_pml_base_framework.framework_components, saves the
 * selected PML component/module and installs the monitoring ones instead.
 * When it is already active, it restores the saved PML, calls that PML's
 * pmlm_finalize, finalizes the common monitoring data, retains this component
 * in the repository and closes it through mca_base_component_close.
 *
 * NOTE(review): this text comes from a rendered diff where removed and added
 * lines are interleaved without markers; the first `if` below and the two
 * lines after it look like the pre-change body -- confirm against the file.
 */
static int mca_pml_monitoring_component_finish(void)
{
if( mca_common_monitoring_enabled && mca_pml_monitoring_active ) {
/* Free internal data structure */
mca_common_monitoring_finalize();
if( !mca_common_monitoring_enabled )
return OMPI_SUCCESS;
if( !mca_pml_monitoring_active ) {
/* The monitoring component priority is always low to guarantee that the component
 * is never selected. Thus, the first time component_finish is called it is right
 * after the selection of the best PML was done, and the perfect moment to intercept
 * it. At this point we remove ourselves from ompi_pml_base_framework.framework_components
 * so that the component never gets closed and unloaded and its VARs are safe for
 * the rest of the execution.
 */
mca_pml_base_component_t *component = NULL;
mca_base_component_list_item_t *cli = NULL;
OPAL_LIST_FOREACH(cli, &ompi_pml_base_framework.framework_components, mca_base_component_list_item_t) {
component = (mca_pml_base_component_t *) cli->cli_component;
if( component == &mca_pml_monitoring_component ) {
opal_list_remove_item(&ompi_pml_base_framework.framework_components, (opal_list_item_t*)cli);
OBJ_RELEASE(cli);
/* Stop iterating right away: cli was just removed from the list and released */
break;
}
}
/**
 * We are supposed to monitor the execution. Save the winner PML component and
 * module, and swap it with ourselves.
 * NOTE(review): the original comment also mentioned increasing our refcount so
 * that we are not dlclosed, but no retain call is visible in this branch; the
 * only retain call appears in the else branch below.
 */
/* Save a copy of the selected PML */
pml_selected_component = mca_pml_base_selected_component;
pml_selected_module = mca_pml;
/* Install our interception layer */
mca_pml_base_selected_component = mca_pml_monitoring_component;
mca_pml = mca_pml_monitoring_module;
/* Restore some of the original values: progress, flags, tags and context id */
mca_pml.pml_progress = pml_selected_module.pml_progress;
mca_pml.pml_max_contextid = pml_selected_module.pml_max_contextid;
mca_pml.pml_max_tag = pml_selected_module.pml_max_tag;
/* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
mca_pml.pml_flags = pml_selected_module.pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD;
mca_pml_monitoring_active = 1;
} else {
/**
 * This is the second call to component_finish, and the component is actively
 * intercepting the calls to the best PML. Time to stop and cleanly finalize ourselves.
 */
/* Restore the original PML */
mca_pml_base_selected_component = pml_selected_component;
mca_pml = pml_selected_module;
/* Forward the finalize call to the original PML component */
pml_selected_component.pmlm_finalize();
/* Free internal data structure */
mca_common_monitoring_finalize();
/**
 * We should never release the last ref on the current
 * component or face forever punishment.
 * We are in the component code itself, we need to prevent the dlloader from
 * removing the code. This will result in minimal memory leaks, but it is the only
 * way to remove most of the references to the component (including the *vars).
 */
/* mca_base_component_repository_release(&mca_common_monitoring_component.pmlm_version); */
/* NOTE(review): the return value of the retain call is not checked here */
mca_base_component_repository_retain_component(mca_pml_monitoring_component.pmlm_version.mca_type_name,
mca_pml_monitoring_component.pmlm_version.mca_component_name);
/* Release all memory and be gone. */
mca_base_component_close((mca_base_component_t*)&mca_pml_monitoring_component,
ompi_pml_base_framework.framework_output);
}
return OMPI_SUCCESS;
}
@ -188,7 +197,7 @@ mca_pml_base_component_2_0_0_t mca_pml_monitoring_component = {
.mca_component_name = "monitoring", /* MCA component name */
MCA_MONITORING_MAKE_VERSION,
.mca_open_component = mca_pml_monitoring_component_open, /* component open */
.mca_close_component = mca_pml_monitoring_component_close, /* component close */
.mca_close_component = NULL, /* component close */
.mca_register_component_params = mca_pml_monitoring_component_register
},
.pmlm_data = {

Просмотреть файл

@ -50,10 +50,16 @@ void mca_base_component_close (const mca_base_component_t *component, int output
{
/* Close */
if (NULL != component->mca_close_component) {
component->mca_close_component();
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
"mca: base: close: component %s closed",
component->mca_component_name);
if( OPAL_SUCCESS == component->mca_close_component() ) {
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
"mca: base: close: component %s closed",
component->mca_component_name);
} else {
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
"mca: base: close: component %s refused to close [drop it]",
component->mca_component_name);
return;
}
}
mca_base_component_unload (component, output_id);