1
1

Share the currently selected PML in the modex information, then check,
whenever new procs are added, that each remote proc's PML is the same as
our local PML. This turns the hangs caused by mismatched PMLs into an
abort, which is better, I think.

This commit was SVN r13582.
Этот коммит содержится в:
Brian Barrett 2007-02-09 16:38:16 +00:00
родитель 5818a32245
Коммит 041beeb1b6
5 изменённых файлов: 94 добавлений и 0 удалений

Просмотреть файл

@ -36,6 +36,13 @@ OMPI_DECLSPEC int mca_pml_base_open(void);
OMPI_DECLSPEC int mca_pml_base_progress(void);
OMPI_DECLSPEC int mca_pml_base_select(bool enable_progress_threads,
bool enable_mpi_threads);
/*
 * Share in the modex the name of the selected component.
 *
 * name is a NUL-terminated component name; it is sent including its
 * terminator.  Returns the status of the underlying modex send.
 */
OMPI_DECLSPEC int mca_pml_base_pml_selected(const char *name);
/*
 * Verify that all new procs are using the currently selected component.
 *
 * my_pml is the name of the local PML, procs/nprocs the procs being
 * added.  The local proc is skipped; for every other proc the name it
 * shared in the modex is compared against my_pml.
 *
 * Returns OMPI_SUCCESS if every peer matches, OMPI_ERR_UNREACH (after
 * logging both names) at the first mismatch, or the error returned by
 * the modex receive.
 */
OMPI_DECLSPEC int mca_pml_base_pml_check_selected(const char *my_pml,
struct ompi_proc_t **procs,
size_t nprocs);
OMPI_DECLSPEC int mca_pml_base_close(void);

Просмотреть файл

@ -28,6 +28,8 @@
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
typedef struct opened_component_t {
opal_list_item_t super;
@ -178,7 +180,68 @@ int mca_pml_base_select(bool enable_progress_threads,
/* register the winner's callback */
opal_progress_register(mca_pml.pml_progress);
/* register winner in the modex */
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
/* All done */
return OMPI_SUCCESS;
}
/*
 * Need a "commonly" named PML structure so everything ends up in the
 * same modex field.
 *
 * This descriptor is not a real, loadable component: it only serves as
 * the key under which every process sends (mca_pml_base_pml_selected)
 * and receives (mca_pml_base_pml_check_selected) the name of its PML,
 * independent of which PML component was actually selected.
 *
 * NOTE(review): the initializer is positional; presumably "pml" is the
 * framework (type) name, "base" the component name and the two NULLs
 * the open/close hooks -- confirm against mca_base_component_t.
 */
static mca_base_component_t pml_base_component = {
MCA_BASE_VERSION_1_0_0,
"pml",
MCA_BASE_VERSION_1_0_0,
"base",
MCA_BASE_VERSION_1_0_0,
NULL,
NULL
};
/*
 * Publish the name of the locally selected PML through the modex.
 *
 * The name is sent under the shared pml_base_component key, together
 * with its terminating NUL so that the receiver gets a complete C
 * string.  The return value is whatever the modex send reports.
 */
int
mca_pml_base_pml_selected(const char *name)
{
    /* +1: the terminator is part of the payload */
    const size_t payload_len = strlen(name) + 1;

    return mca_pml_base_modex_send(&pml_base_component, name, payload_len);
}
/*
 * Verify that every proc in procs[0..nprocs) selected the same PML as
 * the local process.
 *
 * For each proc other than the local one, the PML name that proc shared
 * in the modex (under the pml_base_component key) is received and
 * compared, including its length, against my_pml.
 *
 * Returns:
 *   OMPI_SUCCESS      all peers use my_pml (also when nprocs is 0)
 *   OMPI_ERR_UNREACH  a peer uses a different PML; both names are logged
 *   other             error code returned by the modex receive
 *
 * The buffer handed back by the modex receive is owned by this function
 * and is released on both the match and the mismatch path.
 */
int
mca_pml_base_pml_check_selected(const char *my_pml,
                                ompi_proc_t **procs,
                                size_t nprocs)
{
    /* loop-invariant: size of our own name including the terminator */
    const size_t my_pml_size = strlen(my_pml) + 1;
    size_t i;

    for (i = 0 ; i < nprocs ; ++i) {
        char *remote_pml = NULL;
        size_t size = 0;
        int ret;

        /* nothing to compare against for ourselves */
        if (ompi_proc_local() == procs[i]) {
            continue;
        }

        ret = mca_pml_base_modex_recv(&pml_base_component,
                                      procs[i],
                                      (void**) &remote_pml, &size);
        if (OMPI_SUCCESS != ret) {
            return ret;
        }

        /* The size test comes first so that strcmp() only runs on a
           buffer known to be at least as long as my_pml.  A missing
           buffer is treated as a mismatch rather than dereferenced. */
        if ((NULL == remote_pml) ||
            (size != my_pml_size) ||
            (0 != strcmp(my_pml, remote_pml))) {
            /* never hand a NULL pointer to a %s conversion */
            const char *peer_pml =
                (NULL != remote_pml) ? remote_pml : "(unknown)";

            if (procs[i]->proc_hostname) {
                opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] on %s selected pml %s",
                            ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
                            my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
                            procs[i]->proc_hostname, peer_pml);
            } else {
                opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] selected pml %s",
                            ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
                            my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
                            peer_pml);
            }

            /* release the received name before bailing out;
               free(NULL) is a no-op */
            free(remote_pml);
            return OMPI_ERR_UNREACH;
        }

        free(remote_pml);
    }

    return OMPI_SUCCESS;
}

Просмотреть файл

@ -14,6 +14,7 @@
#include "opal/class/opal_list.h"
#include "ompi/mca/pml/base/pml_base_request.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/mca/pml/base/base.h"
#include "pml_cm.h"
#include "pml_cm_sendreq.h"
@ -87,6 +88,13 @@ mca_pml_cm_add_procs(struct ompi_proc_t** procs, size_t nprocs)
}
#endif
/* make sure remote procs are using the same PML as us */
if (OMPI_SUCCESS != (ret = mca_pml_base_pml_check_selected("cm",
procs,
nprocs))) {
return ret;
}
endpoints = (struct mca_mtl_base_endpoint_t**)malloc(nprocs * sizeof(struct mca_mtl_base_endpoint_t*));
if (NULL == endpoints) return OMPI_ERROR;

Просмотреть файл

@ -36,6 +36,7 @@
#include "ompi/mca/bml/base/base.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "ompi/mca/pml/base/base.h"
mca_pml_dr_t mca_pml_dr = {
{
@ -120,6 +121,13 @@ int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs)
}
#endif
/* make sure remote procs are using the same PML as us */
if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("dr",
procs,
nprocs))) {
return rc;
}
OBJ_CONSTRUCT(&reachable, ompi_bitmap_t);
rc = ompi_bitmap_init(&reachable, (int)nprocs);
if(OMPI_SUCCESS != rc)

Просмотреть файл

@ -24,6 +24,7 @@
#include "ompi/class/ompi_bitmap.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/btl/base/base.h"
#include "pml_ob1.h"
#include "pml_ob1_component.h"
@ -118,6 +119,13 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
if(OMPI_SUCCESS != rc)
return rc;
/* make sure remote procs are using the same PML as us */
if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("ob1",
procs,
nprocs))) {
return rc;
}
bml_endpoints = (struct mca_bml_base_endpoint_t **) malloc ( nprocs *
sizeof(struct mca_bml_base_endpoint_t*));
if ( NULL == bml_endpoints ) {