From 0461826a4b9249a6d596bfba476146788a031372 Mon Sep 17 00:00:00 2001 From: Samuel Gutierrez Date: Tue, 9 Oct 2012 20:48:04 +0000 Subject: [PATCH] Fix bus errors caused by an inadequate amount of space during opal_shmem_segment_create by testing whether or not the target mount has enough space to accommodate the shared-memory backing store. Fixes trac:2827. Will work with Shiqing to add Windows support (if required). This commit was SVN r27433. The following Trac tickets were found above: Ticket 2827 --> https://svn.open-mpi.org/trac/ompi/ticket/2827 --- opal/mca/shmem/mmap/help-opal-shmem-mmap.txt | 11 ++- opal/mca/shmem/mmap/shmem_mmap_module.c | 78 +++++++++++++++++++- opal/util/path.c | 54 ++++++++++++++ opal/util/path.h | 18 ++++- 4 files changed, 158 insertions(+), 3 deletions(-) diff --git a/opal/mca/shmem/mmap/help-opal-shmem-mmap.txt b/opal/mca/shmem/mmap/help-opal-shmem-mmap.txt index 3340699dc3..55a2014853 100644 --- a/opal/mca/shmem/mmap/help-opal-shmem-mmap.txt +++ b/opal/mca/shmem/mmap/help-opal-shmem-mmap.txt @@ -1,7 +1,7 @@ # -*- text -*- # # Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2010 Los Alamos National Security, LLC. +# Copyright (c) 2010-2012 Los Alamos National Security, LLC. # All rights reserved. # # $COPYRIGHT$ @@ -42,3 +42,12 @@ the MCA parameter "orte_no_session_dir". You can set the MCA paramter shmem_mmap_enable_nfs_warning to 0 to disable this message. +# +[target full] +It appears as if there is not enough space for %s (the shared-memory backing +file). It is likely that your MPI job will now either abort or experience +performance degradation. + + Local host: %s + Space Requested: %ld B + Space Available: %ld B diff --git a/opal/mca/shmem/mmap/shmem_mmap_module.c b/opal/mca/shmem/mmap/shmem_mmap_module.c index f6fb514657..d18ab4838a 100644 --- a/opal/mca/shmem/mmap/shmem_mmap_module.c +++ b/opal/mca/shmem/mmap/shmem_mmap_module.c @@ -11,7 +11,7 @@ * All rights reserved. 
* Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2010-2011 Los Alamos National Security, LLC.
+ * Copyright (c) 2010-2012 Los Alamos National Security, LLC.
  * All rights reserved.
  *
  * $COPYRIGHT$
@@ -88,6 +88,12 @@ segment_unlink(opal_shmem_ds_t *ds_buf);
 static int
 module_finalize(void);
 
+static int
+enough_space(const char *filename,
+             long space_req,
+             long *space_avail,
+             bool *result);
+
 /*
  * mmap shmem module
  */
@@ -132,6 +138,83 @@ shmem_ds_reset(opal_shmem_ds_t *ds_buf)
     ds_buf->seg_base_addr = (unsigned char *)MAP_FAILED;
 }
 
+/* ////////////////////////////////////////////////////////////////////////// */
+/**
+ * Checks whether the file system holding the parent directory of filename
+ * can accommodate space_req bytes plus a 5% safety margin.
+ *
+ * @param[in]  filename    Path of the backing file that is about to be created
+ * @param[in]  space_req   Size of the backing file in bytes
+ * @param[out] space_avail Free space in bytes as reported by opal_path_df
+ *                         (initialized to 0)
+ * @param[out] result      true if enough space is available, false otherwise
+ *
+ * @retval OPAL_SUCCESS             The check was carried out
+ * @retval OPAL_ERR_OUT_OF_RESOURCE filename could not be duplicated
+ * @retval OPAL_ERROR               filename has no directory component
+ * @return Otherwise the error code returned by opal_path_df
+ */
+static int
+enough_space(const char *filename,
+             long space_req,
+             long *space_avail,
+             bool *result)
+{
+    long avail = 0;
+    /* ask for 5% more than requested so the target is not filled to the brim */
+    long fluff = (long)(.05 * space_req);
+    bool enough = false;
+    char *last_sep = NULL;
+    /* the target file name is passed here, but we need to check the parent
+     * directory. store it so we can extract that info later. */
+    char *target_dir = strdup(filename);
+    int rc;
+
+    if (NULL == target_dir) {
+        rc = OPAL_ERR_OUT_OF_RESOURCE;
+        goto out;
+    }
+    /* get the parent directory. a name without a separator has no parent
+     * component to cut off, so fail instead of writing through NULL. */
+    last_sep = strrchr(target_dir, OPAL_PATH_SEP[0]);
+    if (NULL == last_sep) {
+        rc = OPAL_ERROR;
+        goto out;
+    }
+    /* keep the separator when the parent is the root directory */
+    if (last_sep == target_dir) {
+        ++last_sep;
+    }
+    *last_sep = '\0';
+    /* now check space availability */
+    if (OPAL_SUCCESS != (rc = opal_path_df(target_dir, &avail))) {
+        OPAL_OUTPUT_VERBOSE(
+            (70, opal_shmem_base_output,
+             "WARNING: opal_path_df failure!")
+        );
+        goto out;
+    }
+    /* do we have enough space? */
+    if (avail >= space_req + fluff) {
+        enough = true;
+    }
+    else {
+        OPAL_OUTPUT_VERBOSE(
+            (70, opal_shmem_base_output,
+             "WARNING: not enough space on %s to meet request! "
+             "available: %ld requested: %ld", target_dir,
+             avail, space_req + fluff)
+        );
+    }
+
+out:
+    /* free(NULL) is a no-op, so no guard is needed */
+    free(target_dir);
+    *space_avail = avail;
+    *result = enough;
+    return rc;
+}
+
 /* ////////////////////////////////////////////////////////////////////////// */
 static int
 module_init(void)
@@ -243,6 +326,8 @@ segment_create(opal_shmem_ds_t *ds_buf,
     int rc = OPAL_SUCCESS;
     char *real_file_name = NULL;
     pid_t my_pid = getpid();
+    bool space_available = false;
+    long space_avail_bytes = 0;
 
     /* the real size of the shared memory segment. this includes enough space
      * to store our segment header. */
@@ -311,6 +396,27 @@ segment_create(opal_shmem_ds_t *ds_buf,
         opal_show_help("help-opal-shmem-mmap.txt", "mmap on nfs", 1, hn,
                        real_file_name);
     }
+    /* let's make sure we have enough space for the backing file */
+    if (OPAL_SUCCESS != (rc = enough_space(real_file_name,
+                                           (long)real_size,
+                                           &space_avail_bytes,
+                                           &space_available))) {
+        opal_output(0, "shmem: mmap: an error occurred while determining "
+                    "whether or not %s could be created.", real_file_name);
+        /* rc is set */
+        goto out;
+    }
+    if (!space_available) {
+        char hn[MAXHOSTNAMELEN];
+        gethostname(hn, MAXHOSTNAMELEN - 1);
+        hn[MAXHOSTNAMELEN - 1] = '\0';
+        rc = OPAL_ERR_OUT_OF_RESOURCE;
+        /* the "target full" text takes two byte counts, not the flag */
+        opal_show_help("help-opal-shmem-mmap.txt", "target full", 1,
+                       real_file_name, hn, (long)real_size, space_avail_bytes);
+        goto out;
+    }
+    /* enough space is available, so create the segment */
     if (-1 == (ds_buf->seg_id = open(real_file_name, O_CREAT | O_RDWR, 0600))) {
         int err = errno;
         char hn[MAXHOSTNAMELEN];
diff --git a/opal/util/path.c b/opal/util/path.c
index b7b93baf8c..34b9c81006 100644
--- a/opal/util/path.c
+++ b/opal/util/path.c
@@ -11,6 +11,8 @@
  * All rights reserved.
  * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2010 IBM Corporation. All rights reserved.
+ * Copyright (c) 2012 Los Alamos National Security, LLC.
+ * All rights reserved.
* $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -533,3 +535,94 @@ found:
 #endif /* __WINDOWS__ */
 }
 
+/**
+ * Determines how many bytes are free, for an unprivileged process, on the
+ * file system that contains path.
+ *
+ * A negative block count reported by the file system is treated as no free
+ * space, and the result saturates at the largest value a long can hold.
+ *
+ * @param[in]  path      Path to check
+ * @param[out] out_avail Free space in bytes; only written on success
+ *
+ * @retval OPAL_SUCCESS If the free space could be determined
+ * @retval OPAL_ERROR   If an argument is NULL or the stat(v)fs call failed
+ */
+int
+opal_path_df(const char *path,
+             long *out_avail)
+{
+#if !defined(__WINDOWS__)
+    int rc = -1;
+    int trials = 5;
+    int err = 0;
+    /* largest byte count representable in *out_avail */
+    const unsigned long long max_avail = ~0UL >> 1;
+    unsigned long long blk_size = 0;
+    unsigned long long blk_avail = 0;
+#if defined(__SVR4) && defined(__sun)
+    struct statvfs buf;
+#elif defined(__linux__) || defined (__BSD) || \
+      (defined(__APPLE__) && defined(__MACH__))
+    struct statfs buf;
+#endif
+
+    if (NULL == path || NULL == out_avail) {
+        return OPAL_ERROR;
+    }
+
+    /* retry while the call fails with ESTALE, five attempts at most */
+    do {
+#if defined(__SVR4) && defined(__sun)
+        rc = statvfs(path, &buf);
+#elif defined(__linux__) || defined (__BSD) || \
+      (defined(__APPLE__) && defined(__MACH__))
+        rc = statfs(path, &buf);
+#endif
+        err = errno;
+    } while (-1 == rc && ESTALE == err && (--trials > 0));
+
+    if (-1 == rc) {
+        OPAL_OUTPUT_VERBOSE((10, 2, "opal_path_df: stat(v)fs on "
+                             "path: %s failed with errno: %d (%s)\n",
+                             path, err, strerror(err)));
+        return OPAL_ERROR;
+    }
+
+    /* statvfs counts blocks in units of f_frsize, statfs in units of f_bsize */
+#if defined(__SVR4) && defined(__sun)
+    blk_size = (unsigned long long)(0 != buf.f_frsize ? buf.f_frsize
+                                                      : buf.f_bsize);
+#else
+    blk_size = (unsigned long long)buf.f_bsize;
+#endif
+    /* sometimes buf.f_bavail is negative. test the sign at the full width of
+     * the field: narrowing it to int first makes large block counts look
+     * negative and reports a roomy file system as full. */
+    if ((long long)buf.f_bavail > 0) {
+        blk_avail = (unsigned long long)buf.f_bavail;
+    }
+    /* now set the amount of free space available on path, saturating
+     * rather than overflowing when the byte count does not fit in a long */
+    if (0 != blk_size && blk_avail > max_avail / blk_size) {
+        *out_avail = (long)max_avail;
+    }
+    else {
+        *out_avail = (long)(blk_size * blk_avail);
+    }
+
+    OPAL_OUTPUT_VERBOSE((10, 2, "opal_path_df: stat(v)fs states "
+                         "path: %s has %ld B of free space.",
+                         path, *out_avail));
+
+    return OPAL_SUCCESS;
+
+#else /* defined __WINDOWS__ */
+    /* FIXME if need Windows support */
+    if (NULL == path || NULL == out_avail) {
+        return OPAL_ERROR;
+    }
+    *out_avail = 0;
+    return OPAL_SUCCESS;
+#endif /* !defined(__WINDOWS__) */
+}
diff --git a/opal/util/path.h b/opal/util/path.h
index 9420b35286..e14b1bc2a0 100644
--- a/opal/util/path.h
+++ b/opal/util/path.h
@@ -9,6 +9,8 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2012 Los Alamos National Security, LLC.
+ * All rights reserved.
* $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -23,6 +25,8 @@
 
 #include "opal_config.h"
 
+#include "opal/constants.h"
+
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
@@ -136,6 +140,18 @@ OPAL_DECLSPEC char *opal_path_access(char *fname, char *path, int mode) __opal_a
  */
 OPAL_DECLSPEC bool opal_path_nfs(char *fname) __opal_attribute_warn_unused_result__;
 
+/**
+ * @brief Returns the amount of free space on the file system containing path.
+ *
+ * @param[in]  path      Path to check
+ * @param[out] out_avail Free space available on path, in bytes (if successful)
+ *
+ * @retval OPAL_SUCCESS If the operation was successful
+ * @retval OPAL_ERROR otherwise
+ */
+OPAL_DECLSPEC int
+opal_path_df(const char *path,
+             long *out_avail)__opal_attribute_warn_unused_result__;
+
 END_C_DECLS
 #endif /* OPAL_PATH_H */
-