Share the currently selected PML in the modex information, then check, whenever new procs are added,
that each remote proc's PML is the same as our local PML. This turns the hangs caused by mismatched PMLs into an abort, which is better, I think. This commit was SVN r13582.
Этот коммит содержится в:
родитель
5818a32245
Коммит
041beeb1b6
@ -36,6 +36,13 @@ OMPI_DECLSPEC int mca_pml_base_open(void);
|
||||
OMPI_DECLSPEC int mca_pml_base_progress(void);
|
||||
OMPI_DECLSPEC int mca_pml_base_select(bool enable_progress_threads,
|
||||
bool enable_mpi_threads);
|
||||
/* publish the name of the selected PML component in the modex so that peers can read it */
|
||||
OMPI_DECLSPEC int mca_pml_base_pml_selected(const char *name);
|
||||
/* verify that every proc in procs selected the same PML component as my_pml; returns OMPI_SUCCESS only if all of them match */
|
||||
OMPI_DECLSPEC int mca_pml_base_pml_check_selected(const char *my_pml,
|
||||
struct ompi_proc_t **procs,
|
||||
size_t nprocs);
|
||||
|
||||
OMPI_DECLSPEC int mca_pml_base_close(void);
|
||||
|
||||
|
||||
|
@ -28,6 +28,8 @@
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/mca/pml/base/pml_base_module_exchange.h"
|
||||
|
||||
typedef struct opened_component_t {
|
||||
opal_list_item_t super;
|
||||
@ -178,7 +180,68 @@ int mca_pml_base_select(bool enable_progress_threads,
|
||||
/* register the winner's callback */
|
||||
opal_progress_register(mca_pml.pml_progress);
|
||||
|
||||
|
||||
/* register winner in the modex */
|
||||
mca_pml_base_pml_selected(best_component->pmlm_version.mca_component_name);
|
||||
|
||||
/* All done */
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* need a "commonly" named PML structure so everything ends up in the
|
||||
same modex field */
|
||||
static mca_base_component_t pml_base_component = {
|
||||
MCA_BASE_VERSION_1_0_0,
|
||||
"pml",
|
||||
MCA_BASE_VERSION_1_0_0,
|
||||
"base",
|
||||
MCA_BASE_VERSION_1_0_0,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
||||
int
|
||||
mca_pml_base_pml_selected(const char *name)
|
||||
{
|
||||
return mca_pml_base_modex_send(&pml_base_component, name, strlen(name) + 1);
|
||||
}
|
||||
|
||||
int
|
||||
mca_pml_base_pml_check_selected(const char *my_pml,
|
||||
ompi_proc_t **procs,
|
||||
size_t nprocs)
|
||||
{
|
||||
size_t i, size;
|
||||
int ret;
|
||||
char *remote_pml;
|
||||
|
||||
for (i = 0 ; i < nprocs ; ++i) {
|
||||
if (ompi_proc_local() == procs[i]) continue;
|
||||
|
||||
ret = mca_pml_base_modex_recv(&pml_base_component,
|
||||
procs[i],
|
||||
(void**) &remote_pml, &size);
|
||||
if (OMPI_SUCCESS != ret) return ret;
|
||||
if ((size != strlen(my_pml) + 1) ||
|
||||
(0 != strcmp(my_pml, remote_pml))) {
|
||||
if (procs[i]->proc_hostname) {
|
||||
opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] on %s selected pml %s",
|
||||
ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
|
||||
my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
|
||||
procs[i]->proc_hostname, remote_pml);
|
||||
} else {
|
||||
opal_output(0, "[%lu,%lu,%lu] selected pml %s, but peer [%lu,%lu,%lu] selected pml %s",
|
||||
ORTE_NAME_ARGS(&ompi_proc_local()->proc_name),
|
||||
my_pml, ORTE_NAME_ARGS(&procs[i]->proc_name),
|
||||
remote_pml);
|
||||
}
|
||||
return OMPI_ERR_UNREACH;
|
||||
}
|
||||
|
||||
free(remote_pml);
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "opal/class/opal_list.h"
|
||||
#include "ompi/mca/pml/base/pml_base_request.h"
|
||||
#include "ompi/mca/pml/base/pml_base_bsend.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
|
||||
#include "pml_cm.h"
|
||||
#include "pml_cm_sendreq.h"
|
||||
@ -87,6 +88,13 @@ mca_pml_cm_add_procs(struct ompi_proc_t** procs, size_t nprocs)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* make sure remote procs are using the same PML as us */
|
||||
if (OMPI_SUCCESS != (ret = mca_pml_base_pml_check_selected("cm",
|
||||
procs,
|
||||
nprocs))) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
endpoints = (struct mca_mtl_base_endpoint_t**)malloc(nprocs * sizeof(struct mca_mtl_base_endpoint_t*));
|
||||
if (NULL == endpoints) return OMPI_ERROR;
|
||||
|
||||
|
@ -36,6 +36,7 @@
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
|
||||
mca_pml_dr_t mca_pml_dr = {
|
||||
{
|
||||
@ -120,6 +121,13 @@ int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* make sure remote procs are using the same PML as us */
|
||||
if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("dr",
|
||||
procs,
|
||||
nprocs))) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&reachable, ompi_bitmap_t);
|
||||
rc = ompi_bitmap_init(&reachable, (int)nprocs);
|
||||
if(OMPI_SUCCESS != rc)
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "ompi/class/ompi_bitmap.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
#include "pml_ob1.h"
|
||||
#include "pml_ob1_component.h"
|
||||
@ -118,6 +119,13 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
|
||||
if(OMPI_SUCCESS != rc)
|
||||
return rc;
|
||||
|
||||
/* make sure remote procs are using the same PML as us */
|
||||
if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("ob1",
|
||||
procs,
|
||||
nprocs))) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
bml_endpoints = (struct mca_bml_base_endpoint_t **) malloc ( nprocs *
|
||||
sizeof(struct mca_bml_base_endpoint_t*));
|
||||
if ( NULL == bml_endpoints ) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user