Merge pull request #4323 from aravindksg/fix_help_text
Move help text output regarding PSM2_CUDA environment variable
Этот коммит содержится в:
Коммит
df48ddd2a1
@ -1,7 +1,7 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (C) 2009. QLogic Corporation. All rights reserved.
|
||||
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
|
||||
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -47,5 +47,17 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
|
||||
Message size %llu bigger than supported by PSM2 API. Max = %llu
|
||||
#
|
||||
[no psm2 cuda env]
|
||||
Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s.
|
||||
This is not a recommended combination. If the application uses %s.
|
||||
Warning: Open MPI has detected that you are running in an environment with CUDA
|
||||
devices present and that you are using Intel(r) Omni-Path networking. However,
|
||||
the environment variable PSM2_CUDA was not set, meaning that the PSM2 Omni-Path
|
||||
networking library was not told how to handle CUDA support.
|
||||
|
||||
If your application uses CUDA buffers, you should set the environment variable
|
||||
PSM2_CUDA to 1; otherwise, set it to 0. Setting the variable to the wrong value
|
||||
can have performance implications on your application, or even cause it to
|
||||
crash.
|
||||
|
||||
Since it was not set, Open MPI has defaulted to setting the PSM2_CUDA
|
||||
environment variable to 1.
|
||||
|
||||
Local hostname: %s
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2006-2010 QLogic Corporation. All rights reserved.
|
||||
* Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2017 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -28,6 +28,7 @@
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
|
||||
#include "mtl_psm2.h"
|
||||
@ -45,6 +46,10 @@ static int param_priority;
|
||||
/* MPI_THREAD_MULTIPLE_SUPPORT */
|
||||
opal_mutex_t mtl_psm2_mq_mutex = OPAL_MUTEX_STATIC_INIT;
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
static bool cuda_envvar_set = false;
|
||||
#endif
|
||||
|
||||
static int ompi_mtl_psm2_component_open(void);
|
||||
static int ompi_mtl_psm2_component_close(void);
|
||||
static int ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority);
|
||||
@ -201,9 +206,6 @@ static int
|
||||
ompi_mtl_psm2_component_register(void)
|
||||
{
|
||||
int num_local_procs, num_total_procs;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
char *cuda_env;
|
||||
#endif
|
||||
|
||||
ompi_mtl_psm2.connect_timeout = 180;
|
||||
(void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
|
||||
@ -228,30 +230,6 @@ ompi_mtl_psm2_component_register(void)
|
||||
param_priority = 40;
|
||||
}
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
/*
|
||||
* If using CUDA enabled OpenMPI, the user likely intends to
|
||||
* run with CUDA buffers. So, force-set the envvar here if user failed
|
||||
* to set it.
|
||||
*/
|
||||
cuda_env = getenv("PSM2_CUDA");
|
||||
if (!cuda_env) {
|
||||
opal_show_help("help-mtl-psm2.txt",
|
||||
"no psm2 cuda env", true,
|
||||
"not set",
|
||||
"Host buffers,\nthere will be a performance penalty"
|
||||
" due to OMPI force setting this variable now.\n"
|
||||
"Set environment variable to 0 if using Host buffers" );
|
||||
setenv("PSM2_CUDA", "1", 0);
|
||||
} else if (strcmp(cuda_env, "0") == 0) {
|
||||
opal_show_help("help-mtl-psm2.txt",
|
||||
"no psm2 cuda env", true,
|
||||
"set to 0",
|
||||
"CUDA buffers,\nthe execution will SEGFAULT."
|
||||
" Set environment variable to 1 if using CUDA buffers");
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
|
||||
"priority", "Priority of the PSM2 MTL component",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
@ -272,17 +250,16 @@ static int
|
||||
ompi_mtl_psm2_component_open(void)
|
||||
{
|
||||
int res;
|
||||
glob_t globbuf;
|
||||
globbuf.gl_offs = 0;
|
||||
glob_t globbuf = {0};
|
||||
|
||||
/* Component available only if Omni-Path hardware is present */
|
||||
res = glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf);
|
||||
if (0 == res || GLOB_NOMATCH == res) {
|
||||
if (globbuf.gl_pathc > 0) {
|
||||
globfree(&globbuf);
|
||||
}
|
||||
if (0 != res) {
|
||||
res = glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf);
|
||||
if (0 == res || GLOB_NOMATCH == res) {
|
||||
if (globbuf.gl_pathc > 0) {
|
||||
globfree(&globbuf);
|
||||
}
|
||||
if (0 != res) {
|
||||
@ -336,6 +313,11 @@ ompi_mtl_psm2_component_query(mca_base_module_t **module, int *priority)
|
||||
/*
 * Component close hook.
 *
 * When built with OPAL_CUDA_SUPPORT, removes PSM2_CUDA from the process
 * environment if the file-scope flag cuda_envvar_set is true. That flag is
 * set in this file right after the component itself calls
 * opal_setenv("PSM2_CUDA", "1", ...), so a value supplied by the user is
 * left untouched. Always returns OMPI_SUCCESS.
 */
static int
|
||||
ompi_mtl_psm2_component_close(void)
|
||||
{
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
if (cuda_envvar_set) {
|
||||
opal_unsetenv("PSM2_CUDA", &environ);
|
||||
}
|
||||
|
||||
#endif
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -362,6 +344,11 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
|
||||
int verno_major = PSM2_VERNO_MAJOR;
|
||||
int verno_minor = PSM2_VERNO_MINOR;
|
||||
int local_rank = -1, num_local_procs = 0;
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
int ret;
|
||||
char *cuda_env;
|
||||
glob_t globbuf = {0};
|
||||
#endif
|
||||
|
||||
/* Compute the total number of processes on this host and our local rank
|
||||
* on that node. We need to provide PSM2 with these values so it can
|
||||
@ -389,6 +376,27 @@ ompi_mtl_psm2_component_init(bool enable_progress_threads,
|
||||
ompi_mtl_psm2_set_shadow_env (ompi_mtl_psm2_shadow_variables + i);
|
||||
}
|
||||
|
||||
#if OPAL_CUDA_SUPPORT
|
||||
/*
|
||||
* If using CUDA enabled Open MPI, the user likely intends to
|
||||
* run with CUDA buffers. So, force-set the envvar here if user failed
|
||||
* to set it.
|
||||
*/
|
||||
ret = glob("/sys/module/nvidia", GLOB_DOOFFS, NULL, &globbuf);
|
||||
if (globbuf.gl_pathc > 0) {
|
||||
globfree(&globbuf);
|
||||
}
|
||||
|
||||
cuda_env = getenv("PSM2_CUDA");
|
||||
if (!cuda_env && (0 == ret)) {
|
||||
opal_show_help("help-mtl-psm2.txt",
|
||||
"no psm2 cuda env", true,
|
||||
ompi_process_info.nodename);
|
||||
opal_setenv("PSM2_CUDA", "1", false, &environ);
|
||||
cuda_envvar_set = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
err = psm2_init(&verno_major, &verno_minor);
|
||||
if (err) {
|
||||
opal_show_help("help-mtl-psm2.txt",
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user