PSM MTL: Don't connect procs already connected
PSM has issues when trying calling psm_ep_connect() more than once for a specific peer. Use the psm_ep_connect mask argument to avoid connecting to processes that are already connected. OMPI ticket #268.
Этот коммит содержится в:
родитель
a632b632ca
Коммит
b97cda7fd9
@ -258,6 +258,7 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
int i,j;
|
||||
int rc;
|
||||
psm_epid_t *epids_in = NULL;
|
||||
int *mask_in = NULL;
|
||||
psm_epid_t *epid;
|
||||
psm_epaddr_t *epaddrs_out = NULL;
|
||||
psm_error_t *errs_out = NULL, err;
|
||||
@ -276,6 +277,10 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
if (epids_in == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
mask_in = (int *) malloc(nprocs * sizeof(int));
|
||||
if (mask_in == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
epaddrs_out = (psm_epaddr_t *) malloc(nprocs * sizeof(psm_epaddr_t));
|
||||
if (epaddrs_out == NULL) {
|
||||
goto bail;
|
||||
@ -284,12 +289,19 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
|
||||
/* Get the epids for all the processes from modex */
|
||||
for (i = 0; i < (int) nprocs; i++) {
|
||||
if (NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) {
|
||||
/* Already connected: don't connect again */
|
||||
mask_in[i] = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
OPAL_MODEX_RECV(rc, &mca_mtl_psm_component.super.mtl_version,
|
||||
&procs[i]->super, (void**)&epid, &size);
|
||||
if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
epids_in[i] = *epid;
|
||||
mask_in[i] = 1;
|
||||
}
|
||||
|
||||
timeout_in_secs = max(ompi_mtl_psm.connect_timeout, 0.5 * nprocs);
|
||||
@ -299,7 +311,7 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
err = psm_ep_connect(ompi_mtl_psm.ep,
|
||||
nprocs,
|
||||
epids_in,
|
||||
NULL, /* connect all */
|
||||
mask_in,
|
||||
errs_out,
|
||||
epaddrs_out,
|
||||
timeout_in_secs * 1e9);
|
||||
@ -310,6 +322,10 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
psm_error_get_string(err));
|
||||
}
|
||||
for (i = 0; i < (int) nprocs; i++) {
|
||||
if (0 == mask_in[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
psm_error_t thiserr = errs_out[i];
|
||||
errstr = (char *) ompi_mtl_psm_connect_error_msg(thiserr);
|
||||
if (proc_errors[thiserr] == 0) {
|
||||
@ -336,6 +352,10 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
|
||||
|
||||
/* Fill in endpoint data */
|
||||
for (i = 0; i < (int) nprocs; i++) {
|
||||
if (0 == mask_in[i]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
mca_mtl_psm_endpoint_t *endpoint =
|
||||
(mca_mtl_psm_endpoint_t *) OBJ_NEW(mca_mtl_psm_endpoint_t);
|
||||
endpoint->peer_epid = epids_in[i];
|
||||
@ -350,6 +370,9 @@ bail:
|
||||
if (epids_in != NULL) {
|
||||
free(epids_in);
|
||||
}
|
||||
if (mask_in != NULL) {
|
||||
free(mask_in);
|
||||
}
|
||||
if (errs_out != NULL) {
|
||||
free(errs_out);
|
||||
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user