1
1

Don't open PSM context when run on single node

When running many ranks on a single node using PSM, it's possible to
exhaust the network hardware contexts (there are 16).  This patch checks
if only a single node is being used. If so, the 'ipath' component of PSM
is disabled and no hardware contexts are opened.
Этот коммит содержится в:
Andrew Friedley 2014-10-28 08:46:51 -07:00
родитель 87dffacc56
Коммит 273135dbc7

Просмотреть файл

@@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved. * Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. * Copyright (c) 2012 Los Alamos National Security, LLC.
* All rights reserved. * All rights reserved.
* Copyright (c) 2014 Intel Corporation. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@@ -186,6 +187,13 @@ ompi_mtl_psm_component_close(void)
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
/*
 * Report the process count recorded in ompi_process_info.
 *
 * out_ntp: receives ompi_process_info.num_procs converted to int.
 *
 * Always returns OMPI_SUCCESS; the status return lets callers check the
 * result with the usual OMPI_SUCCESS comparison.
 */
static int
get_num_total_procs(int *out_ntp)
{
    const int total = (int) ompi_process_info.num_procs;

    *out_ntp = total;

    return OMPI_SUCCESS;
}
static int static int
get_num_local_procs(int *out_nlp) get_num_local_procs(int *out_nlp)
{ {
@@ -218,6 +226,7 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
int verno_major = PSM_VERNO_MAJOR; int verno_major = PSM_VERNO_MAJOR;
int verno_minor = PSM_VERNO_MINOR; int verno_minor = PSM_VERNO_MINOR;
int local_rank = -1, num_local_procs = 0; int local_rank = -1, num_local_procs = 0;
int num_total_procs = 0;
/* Compute the total number of processes on this host and our local rank /* Compute the total number of processes on this host and our local rank
* on that node. We need to provide PSM with these values so it can * on that node. We need to provide PSM with these values so it can
@@ -232,6 +241,12 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
opal_output(0, "Cannot determine local rank. Cannot continue.\n"); opal_output(0, "Cannot determine local rank. Cannot continue.\n");
return NULL; return NULL;
} }
if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) {
opal_output(0, "Cannot determine total number of processes. "
"Cannot continue.\n");
return NULL;
}
err = psm_error_register_handler(NULL /* no ep */, err = psm_error_register_handler(NULL /* no ep */,
PSM_ERRHANDLER_NOP); PSM_ERRHANDLER_NOP);
@@ -254,16 +269,25 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
} }
#endif #endif
if (getenv("PSM_DEVICES") == NULL) {
/* Only allow for shm and ipath devices in 2.0 and earlier releases /* Only allow for shm and ipath devices in 2.0 and earlier releases
* (unless the user overrides the setting). * (unless the user overrides the setting).
*/ */
if (PSM_VERNO >= 0x0104) { if (PSM_VERNO >= 0x0104) {
if (num_local_procs == num_total_procs) {
setenv("PSM_DEVICES", "self,shm", 0);
} else {
setenv("PSM_DEVICES", "self,shm,ipath", 0); setenv("PSM_DEVICES", "self,shm,ipath", 0);
} }
}
else { else {
if (num_local_procs == num_total_procs) {
setenv("PSM_DEVICES", "shm", 0);
} else {
setenv("PSM_DEVICES", "shm,ipath", 0); setenv("PSM_DEVICES", "shm,ipath", 0);
} }
}
}
err = psm_init(&verno_major, &verno_minor); err = psm_init(&verno_major, &verno_minor);
if (err) { if (err) {