2007-12-21 06:02:00 +00:00
|
|
|
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
2004-01-30 03:54:52 +00:00
|
|
|
/*
|
2010-03-12 23:57:50 +00:00
|
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
2005-11-05 19:57:48 +00:00
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2007-12-21 06:02:00 +00:00
|
|
|
* Copyright (c) 2004-2007 The University of Tennessee and The University
|
2005-11-05 19:57:48 +00:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 20:09:25 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 12:43:37 +00:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2012-04-06 14:23:13 +00:00
|
|
|
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
|
|
|
|
* reserved.
|
As per the email discussion, revise the sparse handling of hostnames so that we avoid potential infinite loops while allowing large-scale users to improve their startup time:
* add a new MCA param orte_hostname_cutoff to specify the number of nodes at which we stop including hostnames. This defaults to INT_MAX => always include hostnames. If a value is given, then we will include hostnames for any allocation smaller than the given limit.
* remove ompi_proc_get_hostname. Replace all occurrences with a direct link to ompi_proc_t's proc_hostname, protected by appropriate "if NULL"
* modify the OMPI-ORTE integration component so that any call to modex_recv automatically loads the ompi_proc_t->proc_hostname field as well as returning the requested info. Thus, any process whose modex info you retrieve will automatically receive the hostname. Note that on-demand retrieval is still enabled - i.e., if we are running under direct launch with PMI, the hostname will be fetched upon first call to modex_recv, and then the ompi_proc_t->proc_hostname field will be loaded
* removed a stale MCA param "mpi_keep_peer_hostnames" that was no longer used anywhere in the code base
* added an envar lookup in ess/pmi for the number of nodes in the allocation. Sadly, PMI itself doesn't provide that info, so we have to get it a different way. Currently, we support PBS-based systems and SLURM - for any other, rank0 will emit a warning and we assume max number of daemons so we will always retain hostnames
This commit was SVN r29052.
2013-08-20 18:59:36 +00:00
|
|
|
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
2004-11-22 01:38:40 +00:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-01-30 03:54:52 +00:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2004-06-07 15:33:53 +00:00
|
|
|
#include "ompi_config.h"
|
2004-01-30 03:54:52 +00:00
|
|
|
|
2009-03-13 02:10:32 +00:00
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
#include <string.h>
|
|
|
|
#endif
|
|
|
|
|
2005-07-03 16:22:16 +00:00
|
|
|
#include "opal/class/opal_list.h"
|
2009-02-14 02:26:12 +00:00
|
|
|
#include "opal/util/output.h"
|
2013-02-12 21:10:11 +00:00
|
|
|
#include "opal/util/show_help.h"
|
2005-09-12 20:22:59 +00:00
|
|
|
#include "opal/runtime/opal_progress.h"
|
2005-09-14 09:37:20 +00:00
|
|
|
#include "opal/mca/mca.h"
|
|
|
|
#include "opal/mca/base/base.h"
|
2008-12-09 23:49:02 +00:00
|
|
|
#include "opal/runtime/opal.h"
|
2008-02-28 01:57:57 +00:00
|
|
|
|
2006-02-12 01:33:29 +00:00
|
|
|
#include "ompi/constants.h"
|
2005-09-14 09:37:20 +00:00
|
|
|
#include "ompi/mca/pml/pml.h"
|
|
|
|
#include "ompi/mca/pml/base/base.h"
|
2007-02-09 16:38:16 +00:00
|
|
|
#include "ompi/proc/proc.h"
|
2007-07-09 17:16:34 +00:00
|
|
|
#include "ompi/runtime/ompi_module_exchange.h"
|
2004-01-30 03:54:52 +00:00
|
|
|
|
2004-08-02 00:24:22 +00:00
|
|
|
typedef struct opened_component_t {
|
2005-07-03 16:22:16 +00:00
|
|
|
opal_list_item_t super;
|
2004-08-02 00:24:22 +00:00
|
|
|
mca_pml_base_component_t *om_component;
|
|
|
|
} opened_component_t;
|
2004-01-30 03:54:52 +00:00
|
|
|
|
2008-06-26 13:22:48 +00:00
|
|
|
/* Set to true by mca_pml_base_select() when more than one PML could have
 * been considered; consulted when publishing and checking the selected
 * PML name through the modex. */
static bool modex_reqd = false;
|
|
|
|
|
2004-01-30 03:54:52 +00:00
|
|
|
/**
|
2004-08-02 00:24:22 +00:00
|
|
|
* Function for selecting one component from all those that are
|
2004-01-30 03:54:52 +00:00
|
|
|
* available.
|
|
|
|
*
|
2004-08-02 00:24:22 +00:00
|
|
|
* Call the init function on all available components and get their
|
|
|
|
* priorities. Select the component with the highest priority. All
|
|
|
|
* other components will be closed and unloaded. The selected component
|
2004-01-30 23:00:48 +00:00
|
|
|
* will have all of its function pointers saved and returned to the
|
|
|
|
* caller.
|
2004-01-30 03:54:52 +00:00
|
|
|
*/
|
2005-03-27 13:05:23 +00:00
|
|
|
int mca_pml_base_select(bool enable_progress_threads,
|
|
|
|
bool enable_mpi_threads)
|
2004-01-30 03:54:52 +00:00
|
|
|
{
|
2008-06-26 13:22:48 +00:00
|
|
|
int i, priority = 0, best_priority = 0, num_pml = 0;
|
2005-07-03 16:22:16 +00:00
|
|
|
opal_list_item_t *item = NULL;
|
2005-06-20 16:40:12 +00:00
|
|
|
mca_base_component_list_item_t *cli = NULL;
|
|
|
|
mca_pml_base_component_t *component = NULL, *best_component = NULL;
|
|
|
|
mca_pml_base_module_t *module = NULL, *best_module = NULL;
|
2005-07-03 16:22:16 +00:00
|
|
|
opal_list_t opened;
|
2005-06-20 16:40:12 +00:00
|
|
|
opened_component_t *om = NULL;
|
2007-02-03 02:32:00 +00:00
|
|
|
bool found_pml;
|
2010-03-12 23:57:50 +00:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2007-04-24 17:08:48 +00:00
|
|
|
mca_pml_base_component_t *wrapper_component = NULL;
|
|
|
|
int wrapper_priority = -1;
|
|
|
|
#endif
|
2007-02-21 16:18:43 +00:00
|
|
|
|
2005-06-20 16:40:12 +00:00
|
|
|
/* Traverse the list of available components; call their init
|
|
|
|
functions. */
|
|
|
|
|
|
|
|
best_priority = -1;
|
|
|
|
best_component = NULL;
|
|
|
|
module = NULL;
|
2005-07-03 16:22:16 +00:00
|
|
|
OBJ_CONSTRUCT(&opened, opal_list_t);
|
2013-03-27 21:17:31 +00:00
|
|
|
OPAL_LIST_FOREACH(cli, &ompi_pml_base_framework.framework_components, mca_base_component_list_item_t) {
|
2005-06-20 16:40:12 +00:00
|
|
|
component = (mca_pml_base_component_t *) cli->cli_component;
|
2007-02-21 16:18:43 +00:00
|
|
|
|
2005-06-20 16:40:12 +00:00
|
|
|
/* if there is an include list - item must be in the list to be included */
|
2007-02-03 02:32:00 +00:00
|
|
|
found_pml = false;
|
2007-12-21 06:02:00 +00:00
|
|
|
for( i = 0; i < opal_pointer_array_get_size(&mca_pml_base_pml); i++) {
|
2007-02-21 16:18:43 +00:00
|
|
|
char * tmp_val = NULL;
|
2007-12-21 06:02:00 +00:00
|
|
|
tmp_val = (char *) opal_pointer_array_get_item(&mca_pml_base_pml, i);
|
2007-02-21 16:18:43 +00:00
|
|
|
if( NULL == tmp_val) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(0 == strncmp(component->pmlm_version.mca_component_name,
|
|
|
|
tmp_val, strlen(component->pmlm_version.mca_component_name)) ) {
|
2007-02-03 02:32:00 +00:00
|
|
|
found_pml = true;
|
2007-04-05 13:52:05 +00:00
|
|
|
break;
|
2007-02-03 02:01:18 +00:00
|
|
|
}
|
|
|
|
}
|
2007-02-21 16:18:43 +00:00
|
|
|
|
2007-12-21 06:02:00 +00:00
|
|
|
if(!found_pml && opal_pointer_array_get_size(&mca_pml_base_pml)) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2007-02-03 02:32:00 +00:00
|
|
|
"select: component %s not in the include list",
|
|
|
|
component->pmlm_version.mca_component_name );
|
|
|
|
|
2005-06-20 16:40:12 +00:00
|
|
|
continue;
|
2005-07-12 05:40:56 +00:00
|
|
|
}
|
2007-02-21 16:18:43 +00:00
|
|
|
|
|
|
|
/* if there is no init function - ignore it */
|
2005-06-20 16:40:12 +00:00
|
|
|
if (NULL == component->pmlm_init) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2005-07-12 05:40:56 +00:00
|
|
|
"select: no init function; ignoring component %s",
|
|
|
|
component->pmlm_version.mca_component_name );
|
|
|
|
continue;
|
2004-01-30 03:54:52 +00:00
|
|
|
}
|
2007-02-21 16:18:43 +00:00
|
|
|
|
2008-06-26 13:22:48 +00:00
|
|
|
/* this is a pml that could be considered */
|
|
|
|
num_pml++;
|
|
|
|
|
2007-02-21 16:18:43 +00:00
|
|
|
/* Init component to get its priority */
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2005-07-12 05:40:56 +00:00
|
|
|
"select: initializing %s component %s",
|
|
|
|
component->pmlm_version.mca_type_name,
|
|
|
|
component->pmlm_version.mca_component_name );
|
2007-02-03 02:01:18 +00:00
|
|
|
priority = best_priority;
|
2005-07-12 05:40:56 +00:00
|
|
|
module = component->pmlm_init(&priority, enable_progress_threads,
|
|
|
|
enable_mpi_threads);
|
|
|
|
if (NULL == module) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2005-07-12 05:40:56 +00:00
|
|
|
"select: init returned failure for component %s",
|
|
|
|
component->pmlm_version.mca_component_name );
|
|
|
|
continue;
|
|
|
|
}
|
2007-02-21 16:18:43 +00:00
|
|
|
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2005-07-12 05:40:56 +00:00
|
|
|
"select: init returned priority %d", priority );
|
2010-03-12 23:57:50 +00:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2007-03-16 23:11:45 +00:00
|
|
|
/* Determine if this is the wrapper component */
|
|
|
|
if( priority <= PML_SELECT_WRAPPER_PRIORITY) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2007-03-16 23:11:45 +00:00
|
|
|
"pml:select: Wrapper Component: Component %s was determined to be a Wrapper PML with priority %d",
|
|
|
|
component->pmlm_version.mca_component_name, priority );
|
|
|
|
wrapper_priority = priority;
|
|
|
|
wrapper_component = component;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Otherwise determine if this is the best component */
|
2007-04-24 17:08:48 +00:00
|
|
|
else
|
|
|
|
#endif
|
|
|
|
if (priority > best_priority) {
|
2005-07-12 05:40:56 +00:00
|
|
|
best_priority = priority;
|
|
|
|
best_component = component;
|
|
|
|
best_module = module;
|
|
|
|
}
|
|
|
|
|
2006-08-24 16:38:08 +00:00
|
|
|
om = (opened_component_t*)malloc(sizeof(opened_component_t));
|
2005-07-12 05:40:56 +00:00
|
|
|
if (NULL == om) {
|
|
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
}
|
|
|
|
OBJ_CONSTRUCT(om, opal_list_item_t);
|
|
|
|
om->om_component = component;
|
|
|
|
opal_list_append(&opened, (opal_list_item_t*) om);
|
2004-01-30 03:54:52 +00:00
|
|
|
}
|
|
|
|
|
2005-07-12 05:40:56 +00:00
|
|
|
/* Finished querying all components. Check for the bozo case. */
|
2007-02-03 02:01:18 +00:00
|
|
|
|
2005-07-12 05:40:56 +00:00
|
|
|
if( NULL == best_component ) {
|
2013-02-12 21:10:11 +00:00
|
|
|
opal_show_help("help-mca-base.txt", "find-available:none-found", true, "pml");
|
2007-12-21 06:02:00 +00:00
|
|
|
for( i = 0; i < opal_pointer_array_get_size(&mca_pml_base_pml); i++) {
|
2007-02-21 16:18:43 +00:00
|
|
|
char * tmp_val = NULL;
|
2007-12-21 06:02:00 +00:00
|
|
|
tmp_val = (char *) opal_pointer_array_get_item(&mca_pml_base_pml, i);
|
2007-02-21 16:18:43 +00:00
|
|
|
if( NULL == tmp_val) {
|
|
|
|
continue;
|
|
|
|
}
|
2013-01-27 23:25:10 +00:00
|
|
|
ompi_rte_abort(1, "PML %s cannot be selected", tmp_val);
|
2007-02-03 02:01:18 +00:00
|
|
|
}
|
|
|
|
if(0 == i) {
|
2013-01-27 23:25:10 +00:00
|
|
|
ompi_rte_abort(2, "No pml component available. This shouldn't happen.");
|
2005-07-12 19:30:51 +00:00
|
|
|
}
|
2005-07-12 05:40:56 +00:00
|
|
|
}
|
2007-02-03 02:01:18 +00:00
|
|
|
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2007-02-03 02:01:18 +00:00
|
|
|
"selected %s best priority %d\n",
|
|
|
|
best_component->pmlm_version.mca_component_name, best_priority);
|
|
|
|
|
2008-06-26 13:22:48 +00:00
|
|
|
/* if more than one PML could be considered, then we still need the
|
|
|
|
* modex since we cannot know which one will be selected on all procs
|
|
|
|
*/
|
|
|
|
if (1 < num_pml) {
|
|
|
|
modex_reqd = true;
|
|
|
|
}
|
|
|
|
|
2005-07-12 05:40:56 +00:00
|
|
|
/* Finalize all non-selected components */
|
2004-01-30 03:54:52 +00:00
|
|
|
|
2005-07-12 05:40:56 +00:00
|
|
|
for (item = opal_list_remove_first(&opened);
|
|
|
|
NULL != item;
|
|
|
|
item = opal_list_remove_first(&opened)) {
|
|
|
|
om = (opened_component_t *) item;
|
2007-04-24 17:08:48 +00:00
|
|
|
|
|
|
|
if (om->om_component != best_component
|
2010-03-12 23:57:50 +00:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2007-04-24 17:08:48 +00:00
|
|
|
&& om->om_component != wrapper_component
|
|
|
|
#endif
|
|
|
|
) {
|
2007-03-17 16:33:43 +00:00
|
|
|
/* Finalize */
|
2007-03-16 23:11:45 +00:00
|
|
|
|
2007-03-17 16:33:43 +00:00
|
|
|
if (NULL != om->om_component->pmlm_finalize) {
|
2007-03-16 23:11:45 +00:00
|
|
|
|
2007-03-17 16:33:43 +00:00
|
|
|
/* Blatently ignore the return code (what would we do to
|
|
|
|
recover, anyway? This component is going away, so errors
|
|
|
|
don't matter anymore) */
|
|
|
|
|
|
|
|
om->om_component->pmlm_finalize();
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose(10, ompi_pml_base_framework.framework_output,
|
2007-03-17 16:33:43 +00:00
|
|
|
"select: component %s not selected / finalized",
|
|
|
|
om->om_component->pmlm_version.mca_component_name);
|
2005-07-12 05:40:56 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
OBJ_DESTRUCT( om );
|
|
|
|
free(om);
|
2004-01-30 03:54:52 +00:00
|
|
|
}
|
2005-07-12 05:40:56 +00:00
|
|
|
OBJ_DESTRUCT( &opened );
|
2007-03-16 23:11:45 +00:00
|
|
|
|
2010-03-12 23:57:50 +00:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2013-03-27 21:17:31 +00:00
|
|
|
/* Remove the wrapper component from the ompi_pml_base_framework.framework_components list
|
2007-03-16 23:11:45 +00:00
|
|
|
* so we don't unload it prematurely in the next call
|
|
|
|
*/
|
|
|
|
if( NULL != wrapper_component ) {
|
2013-03-27 21:17:31 +00:00
|
|
|
OPAL_LIST_FOREACH(cli, &ompi_pml_base_framework.framework_components, mca_base_component_list_item_t) {
|
2007-03-16 23:11:45 +00:00
|
|
|
component = (mca_pml_base_component_t *) cli->cli_component;
|
|
|
|
|
|
|
|
if( component == wrapper_component ) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_list_remove_item(&ompi_pml_base_framework.framework_components, item);
|
2007-03-16 23:11:45 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2007-04-24 17:08:48 +00:00
|
|
|
#endif
|
2007-03-16 23:11:45 +00:00
|
|
|
|
2005-07-12 05:40:56 +00:00
|
|
|
/* Save the winner */
|
|
|
|
|
|
|
|
mca_pml_base_selected_component = *best_component;
|
|
|
|
mca_pml = *best_module;
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2005-07-12 05:40:56 +00:00
|
|
|
"select: component %s selected",
|
2007-08-16 05:46:11 +00:00
|
|
|
mca_pml_base_selected_component.pmlm_version.mca_component_name );
|
2007-03-16 23:11:45 +00:00
|
|
|
|
2007-08-03 02:23:24 +00:00
|
|
|
/* This base function closes, unloads, and removes from the
|
|
|
|
available list all unselected components. The available list will
|
|
|
|
contain only the selected component. */
|
|
|
|
|
2013-03-27 21:17:31 +00:00
|
|
|
mca_base_components_close(ompi_pml_base_framework.framework_output,
|
|
|
|
&ompi_pml_base_framework.framework_components,
|
2007-08-03 02:23:24 +00:00
|
|
|
(mca_base_component_t *) best_component);
|
|
|
|
|
2010-03-12 23:57:50 +00:00
|
|
|
#if OPAL_ENABLE_FT_CR == 1
|
2007-03-16 23:11:45 +00:00
|
|
|
/* If we have a wrapper then initalize it */
|
|
|
|
if( NULL != wrapper_component ) {
|
|
|
|
priority = PML_SELECT_WRAPPER_PRIORITY;
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2007-03-16 23:11:45 +00:00
|
|
|
"pml:select: Wrapping: Component %s [%d] is being wrapped by component %s [%d]",
|
|
|
|
mca_pml_base_selected_component.pmlm_version.mca_component_name,
|
|
|
|
best_priority,
|
|
|
|
wrapper_component->pmlm_version.mca_component_name,
|
|
|
|
wrapper_priority );
|
|
|
|
|
|
|
|
/* Ask the wrapper commponent to wrap around the currently
|
|
|
|
* selected component. Indicated by the priority value provided
|
|
|
|
* this will cause the wrapper to do something different this time around
|
|
|
|
*/
|
|
|
|
module = wrapper_component->pmlm_init(&priority,
|
|
|
|
enable_progress_threads,
|
|
|
|
enable_mpi_threads);
|
|
|
|
/* Replace with the wrapper */
|
|
|
|
best_component = wrapper_component;
|
|
|
|
mca_pml_base_selected_component = *best_component;
|
|
|
|
best_module = module;
|
|
|
|
mca_pml = *best_module;
|
|
|
|
}
|
2007-04-24 17:08:48 +00:00
|
|
|
#endif
|
2007-03-16 23:11:45 +00:00
|
|
|
|
2005-07-12 05:40:56 +00:00
|
|
|
/* register the winner's callback */
|
2007-10-09 23:28:53 +00:00
|
|
|
if( NULL != mca_pml.pml_progress ) {
|
|
|
|
opal_progress_register(mca_pml.pml_progress);
|
|
|
|
}
|
2007-02-09 16:38:16 +00:00
|
|
|
|
|
|
|
/* register winner in the modex */
|
2013-01-27 23:25:10 +00:00
|
|
|
if (modex_reqd && 0 == OMPI_PROC_MY_NAME->vpid) {
|
2008-06-26 13:22:48 +00:00
|
|
|
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
|
|
|
|
}
|
2007-02-09 16:38:16 +00:00
|
|
|
|
2005-07-12 05:40:56 +00:00
|
|
|
/* All done */
|
|
|
|
|
|
|
|
return OMPI_SUCCESS;
|
2004-01-30 03:54:52 +00:00
|
|
|
}
|
2007-02-09 16:38:16 +00:00
|
|
|
|
|
|
|
/* need a "commonly" named PML structure so everything ends up in the
|
|
|
|
same modex field */
|
|
|
|
static mca_base_component_t pml_base_component = {
|
2008-07-28 22:40:57 +00:00
|
|
|
MCA_BASE_VERSION_2_0_0,
|
2007-02-09 16:38:16 +00:00
|
|
|
"pml",
|
2008-07-28 22:40:57 +00:00
|
|
|
MCA_BASE_VERSION_2_0_0,
|
2007-02-09 16:38:16 +00:00
|
|
|
"base",
|
2008-07-28 22:40:57 +00:00
|
|
|
MCA_BASE_VERSION_2_0_0,
|
2007-02-09 16:38:16 +00:00
|
|
|
NULL,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
int
|
|
|
|
mca_pml_base_pml_selected(const char *name)
|
|
|
|
{
|
2007-07-09 17:16:34 +00:00
|
|
|
return ompi_modex_send(&pml_base_component, name, strlen(name) + 1);
|
2007-02-09 16:38:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
mca_pml_base_pml_check_selected(const char *my_pml,
|
|
|
|
ompi_proc_t **procs,
|
|
|
|
size_t nprocs)
|
|
|
|
{
|
2008-06-26 13:22:48 +00:00
|
|
|
size_t size;
|
2007-02-09 16:38:16 +00:00
|
|
|
int ret;
|
|
|
|
char *remote_pml;
|
|
|
|
|
2008-06-26 13:22:48 +00:00
|
|
|
/* if no modex was required by the PML, then
|
|
|
|
* we can assume success
|
|
|
|
*/
|
|
|
|
if (!modex_reqd) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2008-06-26 13:22:48 +00:00
|
|
|
"check:select: modex not reqd");
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if we are rank=0, then we can also assume success */
|
2013-01-27 23:25:10 +00:00
|
|
|
if (0 == OMPI_PROC_MY_NAME->vpid) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2008-06-26 13:22:48 +00:00
|
|
|
"check:select: rank=0");
|
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* get the name of the PML module selected by rank=0 */
|
|
|
|
ret = ompi_modex_recv(&pml_base_component,
|
|
|
|
procs[0],
|
|
|
|
(void**) &remote_pml, &size);
|
|
|
|
|
2013-09-25 16:04:00 +00:00
|
|
|
/* if this key wasn't found, then just assume all is well... */
|
|
|
|
if (OMPI_SUCCESS != ret) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2013-09-25 16:04:00 +00:00
|
|
|
"check:select: modex data not found");
|
2008-06-26 13:22:48 +00:00
|
|
|
return OMPI_SUCCESS;
|
2007-02-09 16:38:16 +00:00
|
|
|
}
|
|
|
|
|
2008-06-26 14:08:36 +00:00
|
|
|
/* the remote pml returned should never be NULL if an error
|
|
|
|
* wasn't returned, but just to be safe, and since the check
|
|
|
|
* is fast...let's be sure
|
|
|
|
*/
|
|
|
|
if (NULL == remote_pml) {
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2008-06-26 14:08:36 +00:00
|
|
|
"check:select: got a NULL pml from rank=0");
|
|
|
|
return OMPI_ERR_UNREACH;
|
|
|
|
}
|
|
|
|
|
2013-03-27 21:17:31 +00:00
|
|
|
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
|
2008-06-26 13:22:48 +00:00
|
|
|
"check:select: checking my pml %s against rank=0 pml %s",
|
|
|
|
my_pml, remote_pml);
|
|
|
|
|
|
|
|
/* if that module doesn't match my own, return an error */
|
|
|
|
if ((size != strlen(my_pml) + 1) ||
|
|
|
|
(0 != strcmp(my_pml, remote_pml))) {
|
As per the email discussion, revise the sparse handling of hostnames so that we avoid potential infinite loops while allowing large-scale users to improve their startup time:
* add a new MCA param orte_hostname_cutoff to specify the number of nodes at which we stop including hostnames. This defaults to INT_MAX => always include hostnames. If a value is given, then we will include hostnames for any allocation smaller than the given limit.
* remove ompi_proc_get_hostname. Replace all occurrences with a direct link to ompi_proc_t's proc_hostname, protected by appropriate "if NULL"
* modify the OMPI-ORTE integration component so that any call to modex_recv automatically loads the ompi_proc_t->proc_hostname field as well as returning the requested info. Thus, any process whose modex info you retrieve will automatically receive the hostname. Note that on-demand retrieval is still enabled - i.e., if we are running under direct launch with PMI, the hostname will be fetched upon first call to modex_recv, and then the ompi_proc_t->proc_hostname field will be loaded
* removed a stale MCA param "mpi_keep_peer_hostnames" that was no longer used anywhere in the code base
* added an envar lookup in ess/pmi for the number of nodes in the allocation. Sadly, PMI itself doesn't provide that info, so we have to get it a different way. Currently, we support PBS-based systems and SLURM - for any other, rank0 will emit a warning and we assume max number of daemons so we will always retain hostnames
This commit was SVN r29052.
2013-08-20 18:59:36 +00:00
|
|
|
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
|
|
|
OMPI_NAME_PRINT(&ompi_proc_local()->proc_name),
|
|
|
|
my_pml, OMPI_NAME_PRINT(&procs[0]->proc_name),
|
|
|
|
(NULL == procs[0]->proc_hostname) ? "unknown" : procs[0]->proc_hostname,
|
|
|
|
remote_pml);
|
2008-06-26 13:31:36 +00:00
|
|
|
free(remote_pml); /* cleanup before returning */
|
2008-06-26 13:22:48 +00:00
|
|
|
return OMPI_ERR_UNREACH;
|
|
|
|
}
|
|
|
|
|
|
|
|
free(remote_pml);
|
2007-02-09 16:38:16 +00:00
|
|
|
return OMPI_SUCCESS;
|
|
|
|
}
|