Don't open PSM context when run on single node
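When running many ranks on a single node using PSM, it is possible to exhaust the network hardware contexts (only 16 are available). This patch checks whether the job is confined to a single node, i.e. whether the number of local processes equals the total number of processes. If so, and the user has not already set PSM_DEVICES, the 'ipath' device is left out of PSM_DEVICES, so PSM does not open any hardware contexts.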
Этот коммит содержится в:
родитель
87dffacc56
Коммит
273135dbc7
@ -12,6 +12,7 @@
|
|||||||
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
|
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
|
||||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2014 Intel Corporation. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -186,6 +187,13 @@ ompi_mtl_psm_component_close(void)
|
|||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
get_num_total_procs(int *out_ntp)
|
||||||
|
{
|
||||||
|
*out_ntp = (int)ompi_process_info.num_procs;
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
get_num_local_procs(int *out_nlp)
|
get_num_local_procs(int *out_nlp)
|
||||||
{
|
{
|
||||||
@ -218,6 +226,7 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
|
|||||||
int verno_major = PSM_VERNO_MAJOR;
|
int verno_major = PSM_VERNO_MAJOR;
|
||||||
int verno_minor = PSM_VERNO_MINOR;
|
int verno_minor = PSM_VERNO_MINOR;
|
||||||
int local_rank = -1, num_local_procs = 0;
|
int local_rank = -1, num_local_procs = 0;
|
||||||
|
int num_total_procs = 0;
|
||||||
|
|
||||||
/* Compute the total number of processes on this host and our local rank
|
/* Compute the total number of processes on this host and our local rank
|
||||||
* on that node. We need to provide PSM with these values so it can
|
* on that node. We need to provide PSM with these values so it can
|
||||||
@ -232,6 +241,12 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
|
|||||||
opal_output(0, "Cannot determine local rank. Cannot continue.\n");
|
opal_output(0, "Cannot determine local rank. Cannot continue.\n");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) {
|
||||||
|
opal_output(0, "Cannot determine total number of processes. "
|
||||||
|
"Cannot continue.\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
err = psm_error_register_handler(NULL /* no ep */,
|
err = psm_error_register_handler(NULL /* no ep */,
|
||||||
PSM_ERRHANDLER_NOP);
|
PSM_ERRHANDLER_NOP);
|
||||||
@ -254,16 +269,25 @@ ompi_mtl_psm_component_init(bool enable_progress_threads,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if (getenv("PSM_DEVICES") == NULL) {
|
||||||
/* Only allow for shm and ipath devices in 2.0 and earlier releases
|
/* Only allow for shm and ipath devices in 2.0 and earlier releases
|
||||||
* (unless the user overrides the setting).
|
* (unless the user overrides the setting).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (PSM_VERNO >= 0x0104) {
|
if (PSM_VERNO >= 0x0104) {
|
||||||
|
if (num_local_procs == num_total_procs) {
|
||||||
|
setenv("PSM_DEVICES", "self,shm", 0);
|
||||||
|
} else {
|
||||||
setenv("PSM_DEVICES", "self,shm,ipath", 0);
|
setenv("PSM_DEVICES", "self,shm,ipath", 0);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
|
if (num_local_procs == num_total_procs) {
|
||||||
|
setenv("PSM_DEVICES", "shm", 0);
|
||||||
|
} else {
|
||||||
setenv("PSM_DEVICES", "shm,ipath", 0);
|
setenv("PSM_DEVICES", "shm,ipath", 0);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
err = psm_init(&verno_major, &verno_minor);
|
err = psm_init(&verno_major, &verno_minor);
|
||||||
if (err) {
|
if (err) {
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user