1
1

PSM MTL: Don't connect procs already connected

PSM has issues when psm_ep_connect() is called more than once for a
specific peer.  Use the psm_ep_connect() mask argument to avoid connecting
to processes that are already connected.

OMPI ticket #268.
Этот коммит содержится в:
Andrew Friedley 2014-11-12 15:43:54 -08:00
родитель a632b632ca
Коммит b97cda7fd9

Просмотреть файл

@ -258,6 +258,7 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
int i,j;
int rc;
psm_epid_t *epids_in = NULL;
int *mask_in = NULL;
psm_epid_t *epid;
psm_epaddr_t *epaddrs_out = NULL;
psm_error_t *errs_out = NULL, err;
@ -276,6 +277,10 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
if (epids_in == NULL) {
goto bail;
}
mask_in = (int *) malloc(nprocs * sizeof(int));
if (mask_in == NULL) {
goto bail;
}
epaddrs_out = (psm_epaddr_t *) malloc(nprocs * sizeof(psm_epaddr_t));
if (epaddrs_out == NULL) {
goto bail;
@ -284,12 +289,19 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
/* Get the epids for all the processes from modex */
for (i = 0; i < (int) nprocs; i++) {
if (NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) {
/* Already connected: don't connect again */
mask_in[i] = 0;
continue;
}
OPAL_MODEX_RECV(rc, &mca_mtl_psm_component.super.mtl_version,
&procs[i]->super, (void**)&epid, &size);
if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t)) {
return OMPI_ERROR;
}
epids_in[i] = *epid;
mask_in[i] = 1;
}
timeout_in_secs = max(ompi_mtl_psm.connect_timeout, 0.5 * nprocs);
@ -299,7 +311,7 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
err = psm_ep_connect(ompi_mtl_psm.ep,
nprocs,
epids_in,
NULL, /* connect all */
mask_in,
errs_out,
epaddrs_out,
timeout_in_secs * 1e9);
@ -310,6 +322,10 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
psm_error_get_string(err));
}
for (i = 0; i < (int) nprocs; i++) {
if (0 == mask_in[i]) {
continue;
}
psm_error_t thiserr = errs_out[i];
errstr = (char *) ompi_mtl_psm_connect_error_msg(thiserr);
if (proc_errors[thiserr] == 0) {
@ -336,6 +352,10 @@ ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
/* Fill in endpoint data */
for (i = 0; i < (int) nprocs; i++) {
if (0 == mask_in[i]) {
continue;
}
mca_mtl_psm_endpoint_t *endpoint =
(mca_mtl_psm_endpoint_t *) OBJ_NEW(mca_mtl_psm_endpoint_t);
endpoint->peer_epid = epids_in[i];
@ -350,6 +370,9 @@ bail:
if (epids_in != NULL) {
free(epids_in);
}
if (mask_in != NULL) {
free(mask_in);
}
if (errs_out != NULL) {
free(errs_out);
}