e9b5162d79
Many of the OPAL_ENABLE_FT should be OPAL_ENABLE_FT_CR, so fix those. The OPAL Layer INC should call opal_output on restart so that it can refresh the string it prints to reflect the current pid/hostname which may have changed. This commit was SVN r22824.
402 строки
15 KiB
C
402 строки
15 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
|
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif
|
|
|
|
#include "opal/class/opal_list.h"
|
|
#include "opal/util/output.h"
|
|
#include "orte/util/show_help.h"
|
|
#include "opal/runtime/opal_progress.h"
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/runtime/opal.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/util/name_fns.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "ompi/constants.h"
|
|
#include "ompi/mca/pml/pml.h"
|
|
#include "ompi/mca/pml/base/base.h"
|
|
#include "ompi/proc/proc.h"
|
|
#include "ompi/runtime/ompi_module_exchange.h"
|
|
|
|
typedef struct opened_component_t {
|
|
opal_list_item_t super;
|
|
mca_pml_base_component_t *om_component;
|
|
} opened_component_t;
|
|
|
|
static bool modex_reqd=false;
|
|
|
|
/**
|
|
* Function for selecting one component from all those that are
|
|
* available.
|
|
*
|
|
* Call the init function on all available components and get their
|
|
* priorities. Select the component with the highest priority. All
|
|
* other components will be closed and unloaded. The selected component
|
|
* will have all of its function pointers saved and returned to the
|
|
* caller.
|
|
*/
|
|
int mca_pml_base_select(bool enable_progress_threads,
|
|
bool enable_mpi_threads)
|
|
{
|
|
int i, priority = 0, best_priority = 0, num_pml = 0;
|
|
opal_list_item_t *item = NULL;
|
|
mca_base_component_list_item_t *cli = NULL;
|
|
mca_pml_base_component_t *component = NULL, *best_component = NULL;
|
|
mca_pml_base_module_t *module = NULL, *best_module = NULL;
|
|
opal_list_t opened;
|
|
opened_component_t *om = NULL;
|
|
bool found_pml;
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
mca_pml_base_component_t *wrapper_component = NULL;
|
|
mca_pml_base_module_t *wrapper_module = NULL;
|
|
int wrapper_priority = -1;
|
|
#endif
|
|
|
|
/* Traverse the list of available components; call their init
|
|
functions. */
|
|
|
|
best_priority = -1;
|
|
best_component = NULL;
|
|
module = NULL;
|
|
OBJ_CONSTRUCT(&opened, opal_list_t);
|
|
for (item = opal_list_get_first(&mca_pml_base_components_available);
|
|
opal_list_get_end(&mca_pml_base_components_available) != item;
|
|
item = opal_list_get_next(item) ) {
|
|
cli = (mca_base_component_list_item_t *) item;
|
|
component = (mca_pml_base_component_t *) cli->cli_component;
|
|
|
|
/* if there is an include list - item must be in the list to be included */
|
|
found_pml = false;
|
|
for( i = 0; i < opal_pointer_array_get_size(&mca_pml_base_pml); i++) {
|
|
char * tmp_val = NULL;
|
|
tmp_val = (char *) opal_pointer_array_get_item(&mca_pml_base_pml, i);
|
|
if( NULL == tmp_val) {
|
|
continue;
|
|
}
|
|
|
|
if(0 == strncmp(component->pmlm_version.mca_component_name,
|
|
tmp_val, strlen(component->pmlm_version.mca_component_name)) ) {
|
|
found_pml = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(!found_pml && opal_pointer_array_get_size(&mca_pml_base_pml)) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"select: component %s not in the include list",
|
|
component->pmlm_version.mca_component_name );
|
|
|
|
continue;
|
|
}
|
|
|
|
/* if there is no init function - ignore it */
|
|
if (NULL == component->pmlm_init) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"select: no init function; ignoring component %s",
|
|
component->pmlm_version.mca_component_name );
|
|
continue;
|
|
}
|
|
|
|
/* this is a pml that could be considered */
|
|
num_pml++;
|
|
|
|
/* Init component to get its priority */
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"select: initializing %s component %s",
|
|
component->pmlm_version.mca_type_name,
|
|
component->pmlm_version.mca_component_name );
|
|
priority = best_priority;
|
|
module = component->pmlm_init(&priority, enable_progress_threads,
|
|
enable_mpi_threads);
|
|
if (NULL == module) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"select: init returned failure for component %s",
|
|
component->pmlm_version.mca_component_name );
|
|
continue;
|
|
}
|
|
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"select: init returned priority %d", priority );
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
/* Determine if this is the wrapper component */
|
|
if( priority <= PML_SELECT_WRAPPER_PRIORITY) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"pml:select: Wrapper Component: Component %s was determined to be a Wrapper PML with priority %d",
|
|
component->pmlm_version.mca_component_name, priority );
|
|
wrapper_priority = priority;
|
|
wrapper_component = component;
|
|
wrapper_module = module;
|
|
continue;
|
|
}
|
|
/* Otherwise determine if this is the best component */
|
|
else
|
|
#endif
|
|
if (priority > best_priority) {
|
|
best_priority = priority;
|
|
best_component = component;
|
|
best_module = module;
|
|
}
|
|
|
|
om = (opened_component_t*)malloc(sizeof(opened_component_t));
|
|
if (NULL == om) {
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
OBJ_CONSTRUCT(om, opal_list_item_t);
|
|
om->om_component = component;
|
|
opal_list_append(&opened, (opal_list_item_t*) om);
|
|
}
|
|
|
|
/* Finished querying all components. Check for the bozo case. */
|
|
|
|
if( NULL == best_component ) {
|
|
orte_show_help("help-mca-base.txt", "find-available:none-found", true, "pml");
|
|
for( i = 0; i < opal_pointer_array_get_size(&mca_pml_base_pml); i++) {
|
|
char * tmp_val = NULL;
|
|
tmp_val = (char *) opal_pointer_array_get_item(&mca_pml_base_pml, i);
|
|
if( NULL == tmp_val) {
|
|
continue;
|
|
}
|
|
orte_errmgr.abort(1, "PML %s cannot be selected", tmp_val);
|
|
}
|
|
if(0 == i) {
|
|
orte_errmgr.abort(2, "No pml component available. This shouldn't happen.");
|
|
}
|
|
}
|
|
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"selected %s best priority %d\n",
|
|
best_component->pmlm_version.mca_component_name, best_priority);
|
|
|
|
/* if more than one PML could be considered, then we still need the
|
|
* modex since we cannot know which one will be selected on all procs
|
|
*/
|
|
if (1 < num_pml) {
|
|
modex_reqd = true;
|
|
}
|
|
|
|
/* Finalize all non-selected components */
|
|
|
|
for (item = opal_list_remove_first(&opened);
|
|
NULL != item;
|
|
item = opal_list_remove_first(&opened)) {
|
|
om = (opened_component_t *) item;
|
|
|
|
if (om->om_component != best_component
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
&& om->om_component != wrapper_component
|
|
#endif
|
|
) {
|
|
/* Finalize */
|
|
|
|
if (NULL != om->om_component->pmlm_finalize) {
|
|
|
|
/* Blatently ignore the return code (what would we do to
|
|
recover, anyway? This component is going away, so errors
|
|
don't matter anymore) */
|
|
|
|
om->om_component->pmlm_finalize();
|
|
opal_output_verbose(10, mca_pml_base_output,
|
|
"select: component %s not selected / finalized",
|
|
om->om_component->pmlm_version.mca_component_name);
|
|
}
|
|
}
|
|
OBJ_DESTRUCT( om );
|
|
free(om);
|
|
}
|
|
OBJ_DESTRUCT( &opened );
|
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
/* Remove the wrapper component from the mca_pml_base_components_available list
|
|
* so we don't unload it prematurely in the next call
|
|
*/
|
|
if( NULL != wrapper_component ) {
|
|
for (item = opal_list_get_first(&mca_pml_base_components_available);
|
|
item != opal_list_get_end(&mca_pml_base_components_available);
|
|
item = opal_list_get_next(item) ) {
|
|
cli = (mca_base_component_list_item_t *) item;
|
|
component = (mca_pml_base_component_t *) cli->cli_component;
|
|
|
|
if( component == wrapper_component ) {
|
|
opal_list_remove_item(&mca_pml_base_components_available, item);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/* Save the winner */
|
|
|
|
mca_pml_base_selected_component = *best_component;
|
|
mca_pml = *best_module;
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"select: component %s selected",
|
|
mca_pml_base_selected_component.pmlm_version.mca_component_name );
|
|
|
|
if (opal_profile) {
|
|
opal_output(0, "pml:%s", mca_pml_base_selected_component.pmlm_version.mca_component_name );
|
|
}
|
|
|
|
/* This base function closes, unloads, and removes from the
|
|
available list all unselected components. The available list will
|
|
contain only the selected component. */
|
|
|
|
mca_base_components_close(mca_pml_base_output,
|
|
&mca_pml_base_components_available,
|
|
(mca_base_component_t *) best_component);
|
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
|
/* If we have a wrapper then initalize it */
|
|
if( NULL != wrapper_component ) {
|
|
priority = PML_SELECT_WRAPPER_PRIORITY;
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"pml:select: Wrapping: Component %s [%d] is being wrapped by component %s [%d]",
|
|
mca_pml_base_selected_component.pmlm_version.mca_component_name,
|
|
best_priority,
|
|
wrapper_component->pmlm_version.mca_component_name,
|
|
wrapper_priority );
|
|
|
|
/* Ask the wrapper commponent to wrap around the currently
|
|
* selected component. Indicated by the priority value provided
|
|
* this will cause the wrapper to do something different this time around
|
|
*/
|
|
module = wrapper_component->pmlm_init(&priority,
|
|
enable_progress_threads,
|
|
enable_mpi_threads);
|
|
/* Replace with the wrapper */
|
|
best_component = wrapper_component;
|
|
mca_pml_base_selected_component = *best_component;
|
|
best_module = module;
|
|
mca_pml = *best_module;
|
|
}
|
|
#endif
|
|
|
|
/* register the winner's callback */
|
|
if( NULL != mca_pml.pml_progress ) {
|
|
opal_progress_register(mca_pml.pml_progress);
|
|
}
|
|
|
|
/* register winner in the modex */
|
|
if (modex_reqd && 0 == ORTE_PROC_MY_NAME->vpid) {
|
|
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
|
|
}
|
|
|
|
/* All done */
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* need a "commonly" named PML structure so everything ends up in the
|
|
same modex field */
|
|
static mca_base_component_t pml_base_component = {
|
|
MCA_BASE_VERSION_2_0_0,
|
|
"pml",
|
|
MCA_BASE_VERSION_2_0_0,
|
|
"base",
|
|
MCA_BASE_VERSION_2_0_0,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
|
|
int
|
|
mca_pml_base_pml_selected(const char *name)
|
|
{
|
|
return ompi_modex_send(&pml_base_component, name, strlen(name) + 1);
|
|
}
|
|
|
|
int
|
|
mca_pml_base_pml_check_selected(const char *my_pml,
|
|
ompi_proc_t **procs,
|
|
size_t nprocs)
|
|
{
|
|
size_t size;
|
|
int ret;
|
|
char *remote_pml;
|
|
|
|
/* if no modex was required by the PML, then
|
|
* we can assume success
|
|
*/
|
|
if (!modex_reqd) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"check:select: modex not reqd");
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* if we are rank=0, then we can also assume success */
|
|
if (0 == ORTE_PROC_MY_NAME->vpid) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"check:select: rank=0");
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* get the name of the PML module selected by rank=0 */
|
|
ret = ompi_modex_recv(&pml_base_component,
|
|
procs[0],
|
|
(void**) &remote_pml, &size);
|
|
|
|
/* if modex isn't implemented, then just assume all is well... */
|
|
if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"check:select: modex not implemented");
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
/* the remote pml returned should never be NULL if an error
|
|
* wasn't returned, but just to be safe, and since the check
|
|
* is fast...let's be sure
|
|
*/
|
|
if (NULL == remote_pml) {
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"check:select: got a NULL pml from rank=0");
|
|
return OMPI_ERR_UNREACH;
|
|
}
|
|
|
|
opal_output_verbose( 10, mca_pml_base_output,
|
|
"check:select: checking my pml %s against rank=0 pml %s",
|
|
my_pml, remote_pml);
|
|
|
|
/* if that module doesn't match my own, return an error */
|
|
if ((size != strlen(my_pml) + 1) ||
|
|
(0 != strcmp(my_pml, remote_pml))) {
|
|
if (procs[0]->proc_hostname) {
|
|
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
|
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
|
my_pml, ORTE_NAME_PRINT(&procs[0]->proc_name),
|
|
procs[0]->proc_hostname,
|
|
remote_pml);
|
|
} else {
|
|
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
|
|
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
|
my_pml, ORTE_NAME_PRINT(&procs[0]->proc_name),
|
|
remote_pml);
|
|
}
|
|
free(remote_pml); /* cleanup before returning */
|
|
return OMPI_ERR_UNREACH;
|
|
}
|
|
|
|
free(remote_pml);
|
|
return OMPI_SUCCESS;
|
|
}
|