1
1

Update the PML selection logic to detect when a modex is required and, in those cases, to have only rank=0 report its selected module. This follows the discussion in the email thread on the devel list:

http://www.open-mpi.org/community/lists/devel/2008/06/4223.php

This commit was SVN r18747.
This commit is contained in:
Ralph Castain 2008-06-26 13:22:48 +00:00
parent 7d8ac08d5d
Commit 3631a60181

View file

@ -40,6 +40,8 @@ typedef struct opened_component_t {
mca_pml_base_component_t *om_component; mca_pml_base_component_t *om_component;
} opened_component_t; } opened_component_t;
static bool modex_reqd=false;
/** /**
* Function for selecting one component from all those that are * Function for selecting one component from all those that are
* available. * available.
@ -53,7 +55,7 @@ typedef struct opened_component_t {
int mca_pml_base_select(bool enable_progress_threads, int mca_pml_base_select(bool enable_progress_threads,
bool enable_mpi_threads) bool enable_mpi_threads)
{ {
int i, priority = 0, best_priority = 0; int i, priority = 0, best_priority = 0, num_pml = 0;
opal_list_item_t *item = NULL; opal_list_item_t *item = NULL;
mca_base_component_list_item_t *cli = NULL; mca_base_component_list_item_t *cli = NULL;
mca_pml_base_component_t *component = NULL, *best_component = NULL; mca_pml_base_component_t *component = NULL, *best_component = NULL;
@ -112,6 +114,9 @@ int mca_pml_base_select(bool enable_progress_threads,
continue; continue;
} }
/* this is a pml that could be considered */
num_pml++;
/* Init component to get its priority */ /* Init component to get its priority */
opal_output_verbose( 10, mca_pml_base_output, opal_output_verbose( 10, mca_pml_base_output,
"select: initializing %s component %s", "select: initializing %s component %s",
@ -179,6 +184,13 @@ int mca_pml_base_select(bool enable_progress_threads,
"selected %s best priority %d\n", "selected %s best priority %d\n",
best_component->pmlm_version.mca_component_name, best_priority); best_component->pmlm_version.mca_component_name, best_priority);
/* if more than one PML could be considered, then we still need the
* modex since we cannot know which one will be selected on all procs
*/
if (1 < num_pml) {
modex_reqd = true;
}
/* Finalize all non-selected components */ /* Finalize all non-selected components */
for (item = opal_list_remove_first(&opened); for (item = opal_list_remove_first(&opened);
@ -276,7 +288,9 @@ int mca_pml_base_select(bool enable_progress_threads,
} }
/* register winner in the modex */ /* register winner in the modex */
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name); if (modex_reqd && 0 == ORTE_PROC_MY_NAME->vpid) {
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
}
/* All done */ /* All done */
@ -307,37 +321,60 @@ mca_pml_base_pml_check_selected(const char *my_pml,
ompi_proc_t **procs, ompi_proc_t **procs,
size_t nprocs) size_t nprocs)
{ {
size_t i, size; size_t size;
int ret; int ret;
char *remote_pml; char *remote_pml;
for (i = 0 ; i < nprocs ; ++i) { /* if no modex was required by the PML, then
if (ompi_proc_local() == procs[i]) continue; * we can assume success
*/
ret = ompi_modex_recv(&pml_base_component, if (!modex_reqd) {
procs[i], opal_output_verbose( 10, mca_pml_base_output,
(void**) &remote_pml, &size); "check:select: modex not reqd");
/* if modex isn't implemented, then just assume all is well... */ return OMPI_SUCCESS;
if (OMPI_ERR_NOT_IMPLEMENTED == ret) return OMPI_SUCCESS; }
if (OMPI_SUCCESS != ret) return ret;
if ((size != strlen(my_pml) + 1) || /* if we are rank=0, then we can also assume success */
(0 != strcmp(my_pml, remote_pml))) { if (0 == ORTE_PROC_MY_NAME->vpid) {
if (procs[i]->proc_hostname) { opal_output_verbose( 10, mca_pml_base_output,
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s", "check:select: rank=0");
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name), return OMPI_SUCCESS;
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name), }
procs[i]->proc_hostname, remote_pml);
} else { /* get the name of the PML module selected by rank=0 */
opal_output(0, "%s selected pml %s, but peer %s selected pml %s", ret = ompi_modex_recv(&pml_base_component,
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name), procs[0],
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name), (void**) &remote_pml, &size);
remote_pml);
} /* if modex isn't implemented, then just assume all is well... */
return OMPI_ERR_UNREACH; if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
} opal_output_verbose( 10, mca_pml_base_output,
"check:select: modex not implemented");
free(remote_pml); return OMPI_SUCCESS;
} }
opal_output_verbose( 10, mca_pml_base_output,
"check:select: checking my pml %s against rank=0 pml %s",
my_pml, remote_pml);
/* if that module doesn't match my own, return an error */
if ((size != strlen(my_pml) + 1) ||
(0 != strcmp(my_pml, remote_pml))) {
if (procs[0]->proc_hostname) {
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, ORTE_NAME_PRINT(&procs[0]->proc_name),
procs[0]->proc_hostname,
(NULL == remote_pml) ? "NULL" : remote_pml);
} else {
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
my_pml, ORTE_NAME_PRINT(&procs[0]->proc_name),
(NULL == remote_pml) ? "NULL" : remote_pml);
}
return OMPI_ERR_UNREACH;
}
free(remote_pml);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }