Update the PML selection logic to detect when a modex is required and, in those cases, have only rank=0 report its selected module. This follows the email thread on the devel list:
http://www.open-mpi.org/community/lists/devel/2008/06/4223.php
This commit was SVN r18747.
Этот коммит содержится в:
родитель
7d8ac08d5d
Коммит
3631a60181
@ -40,6 +40,8 @@ typedef struct opened_component_t {
|
|||||||
mca_pml_base_component_t *om_component;
|
mca_pml_base_component_t *om_component;
|
||||||
} opened_component_t;
|
} opened_component_t;
|
||||||
|
|
||||||
|
static bool modex_reqd=false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function for selecting one component from all those that are
|
* Function for selecting one component from all those that are
|
||||||
* available.
|
* available.
|
||||||
@ -53,7 +55,7 @@ typedef struct opened_component_t {
|
|||||||
int mca_pml_base_select(bool enable_progress_threads,
|
int mca_pml_base_select(bool enable_progress_threads,
|
||||||
bool enable_mpi_threads)
|
bool enable_mpi_threads)
|
||||||
{
|
{
|
||||||
int i, priority = 0, best_priority = 0;
|
int i, priority = 0, best_priority = 0, num_pml = 0;
|
||||||
opal_list_item_t *item = NULL;
|
opal_list_item_t *item = NULL;
|
||||||
mca_base_component_list_item_t *cli = NULL;
|
mca_base_component_list_item_t *cli = NULL;
|
||||||
mca_pml_base_component_t *component = NULL, *best_component = NULL;
|
mca_pml_base_component_t *component = NULL, *best_component = NULL;
|
||||||
@ -112,6 +114,9 @@ int mca_pml_base_select(bool enable_progress_threads,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* this is a pml that could be considered */
|
||||||
|
num_pml++;
|
||||||
|
|
||||||
/* Init component to get its priority */
|
/* Init component to get its priority */
|
||||||
opal_output_verbose( 10, mca_pml_base_output,
|
opal_output_verbose( 10, mca_pml_base_output,
|
||||||
"select: initializing %s component %s",
|
"select: initializing %s component %s",
|
||||||
@ -179,6 +184,13 @@ int mca_pml_base_select(bool enable_progress_threads,
|
|||||||
"selected %s best priority %d\n",
|
"selected %s best priority %d\n",
|
||||||
best_component->pmlm_version.mca_component_name, best_priority);
|
best_component->pmlm_version.mca_component_name, best_priority);
|
||||||
|
|
||||||
|
/* if more than one PML could be considered, then we still need the
|
||||||
|
* modex since we cannot know which one will be selected on all procs
|
||||||
|
*/
|
||||||
|
if (1 < num_pml) {
|
||||||
|
modex_reqd = true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Finalize all non-selected components */
|
/* Finalize all non-selected components */
|
||||||
|
|
||||||
for (item = opal_list_remove_first(&opened);
|
for (item = opal_list_remove_first(&opened);
|
||||||
@ -276,7 +288,9 @@ int mca_pml_base_select(bool enable_progress_threads,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* register winner in the modex */
|
/* register winner in the modex */
|
||||||
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
|
if (modex_reqd && 0 == ORTE_PROC_MY_NAME->vpid) {
|
||||||
|
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
|
||||||
|
}
|
||||||
|
|
||||||
/* All done */
|
/* All done */
|
||||||
|
|
||||||
@ -307,37 +321,60 @@ mca_pml_base_pml_check_selected(const char *my_pml,
|
|||||||
ompi_proc_t **procs,
|
ompi_proc_t **procs,
|
||||||
size_t nprocs)
|
size_t nprocs)
|
||||||
{
|
{
|
||||||
size_t i, size;
|
size_t size;
|
||||||
int ret;
|
int ret;
|
||||||
char *remote_pml;
|
char *remote_pml;
|
||||||
|
|
||||||
for (i = 0 ; i < nprocs ; ++i) {
|
/* if no modex was required by the PML, then
|
||||||
if (ompi_proc_local() == procs[i]) continue;
|
* we can assume success
|
||||||
|
*/
|
||||||
ret = ompi_modex_recv(&pml_base_component,
|
if (!modex_reqd) {
|
||||||
procs[i],
|
opal_output_verbose( 10, mca_pml_base_output,
|
||||||
(void**) &remote_pml, &size);
|
"check:select: modex not reqd");
|
||||||
/* if modex isn't implemented, then just assume all is well... */
|
return OMPI_SUCCESS;
|
||||||
if (OMPI_ERR_NOT_IMPLEMENTED == ret) return OMPI_SUCCESS;
|
}
|
||||||
if (OMPI_SUCCESS != ret) return ret;
|
|
||||||
if ((size != strlen(my_pml) + 1) ||
|
/* if we are rank=0, then we can also assume success */
|
||||||
(0 != strcmp(my_pml, remote_pml))) {
|
if (0 == ORTE_PROC_MY_NAME->vpid) {
|
||||||
if (procs[i]->proc_hostname) {
|
opal_output_verbose( 10, mca_pml_base_output,
|
||||||
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
"check:select: rank=0");
|
||||||
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
return OMPI_SUCCESS;
|
||||||
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name),
|
}
|
||||||
procs[i]->proc_hostname, remote_pml);
|
|
||||||
} else {
|
/* get the name of the PML module selected by rank=0 */
|
||||||
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
|
ret = ompi_modex_recv(&pml_base_component,
|
||||||
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
procs[0],
|
||||||
my_pml, ORTE_NAME_PRINT(&procs[i]->proc_name),
|
(void**) &remote_pml, &size);
|
||||||
remote_pml);
|
|
||||||
}
|
/* if modex isn't implemented, then just assume all is well... */
|
||||||
return OMPI_ERR_UNREACH;
|
if (OMPI_ERR_NOT_IMPLEMENTED == ret) {
|
||||||
}
|
opal_output_verbose( 10, mca_pml_base_output,
|
||||||
|
"check:select: modex not implemented");
|
||||||
free(remote_pml);
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opal_output_verbose( 10, mca_pml_base_output,
|
||||||
|
"check:select: checking my pml %s against rank=0 pml %s",
|
||||||
|
my_pml, remote_pml);
|
||||||
|
|
||||||
|
/* if that module doesn't match my own, return an error */
|
||||||
|
if ((size != strlen(my_pml) + 1) ||
|
||||||
|
(0 != strcmp(my_pml, remote_pml))) {
|
||||||
|
if (procs[0]->proc_hostname) {
|
||||||
|
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
|
||||||
|
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
||||||
|
my_pml, ORTE_NAME_PRINT(&procs[0]->proc_name),
|
||||||
|
procs[0]->proc_hostname,
|
||||||
|
(NULL == remote_pml) ? "NULL" : remote_pml);
|
||||||
|
} else {
|
||||||
|
opal_output(0, "%s selected pml %s, but peer %s selected pml %s",
|
||||||
|
ORTE_NAME_PRINT(&ompi_proc_local()->proc_name),
|
||||||
|
my_pml, ORTE_NAME_PRINT(&procs[0]->proc_name),
|
||||||
|
(NULL == remote_pml) ? "NULL" : remote_pml);
|
||||||
|
}
|
||||||
|
return OMPI_ERR_UNREACH;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(remote_pml);
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user